In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/prose/OneDrive/Desktop/Data/data.csv")
df.head()
Out[1]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
0 0 158023 L. Messi 31 https://cdn.sofifa.org/players/4/19/158023.png Argentina https://cdn.sofifa.org/flags/52.png 94 94 FC Barcelona ... 96.0 33.0 28.0 26.0 6.0 11.0 15.0 14.0 8.0 €226.5M
1 1 20801 Cristiano Ronaldo 33 https://cdn.sofifa.org/players/4/19/20801.png Portugal https://cdn.sofifa.org/flags/38.png 94 94 Juventus ... 95.0 28.0 31.0 23.0 7.0 11.0 15.0 14.0 11.0 €127.1M
2 2 190871 Neymar Jr 26 https://cdn.sofifa.org/players/4/19/190871.png Brazil https://cdn.sofifa.org/flags/54.png 92 93 Paris Saint-Germain ... 94.0 27.0 24.0 33.0 9.0 9.0 15.0 15.0 11.0 €228.1M
3 3 193080 De Gea 27 https://cdn.sofifa.org/players/4/19/193080.png Spain https://cdn.sofifa.org/flags/45.png 91 93 Manchester United ... 68.0 15.0 21.0 13.0 90.0 85.0 87.0 88.0 94.0 €138.6M
4 4 192985 K. De Bruyne 27 https://cdn.sofifa.org/players/4/19/192985.png Belgium https://cdn.sofifa.org/flags/7.png 91 92 Manchester City ... 88.0 68.0 58.0 51.0 15.0 13.0 5.0 10.0 13.0 €196.4M

5 rows × 89 columns

In [2]:
#checking the column names
df.columns
Out[2]:
Index(['Unnamed: 0', 'ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag',
       'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Release Clause'],
      dtype='object')
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18207 entries, 0 to 18206
Data columns (total 89 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                18207 non-null  int64  
 1   ID                        18207 non-null  int64  
 2   Name                      18207 non-null  object 
 3   Age                       18207 non-null  int64  
 4   Photo                     18207 non-null  object 
 5   Nationality               18207 non-null  object 
 6   Flag                      18207 non-null  object 
 7   Overall                   18207 non-null  int64  
 8   Potential                 18207 non-null  int64  
 9   Club                      17966 non-null  object 
 10  Club Logo                 18207 non-null  object 
 11  Value                     18207 non-null  object 
 12  Wage                      18207 non-null  object 
 13  Special                   18207 non-null  int64  
 14  Preferred Foot            18159 non-null  object 
 15  International Reputation  18159 non-null  float64
 16  Weak Foot                 18159 non-null  float64
 17  Skill Moves               18159 non-null  float64
 18  Work Rate                 18159 non-null  object 
 19  Body Type                 18159 non-null  object 
 20  Real Face                 18159 non-null  object 
 21  Position                  18147 non-null  object 
 22  Jersey Number             18147 non-null  float64
 23  Joined                    16654 non-null  object 
 24  Loaned From               1264 non-null   object 
 25  Contract Valid Until      17918 non-null  object 
 26  Height                    18159 non-null  object 
 27  Weight                    18159 non-null  object 
 28  LS                        16122 non-null  object 
 29  ST                        16122 non-null  object 
 30  RS                        16122 non-null  object 
 31  LW                        16122 non-null  object 
 32  LF                        16122 non-null  object 
 33  CF                        16122 non-null  object 
 34  RF                        16122 non-null  object 
 35  RW                        16122 non-null  object 
 36  LAM                       16122 non-null  object 
 37  CAM                       16122 non-null  object 
 38  RAM                       16122 non-null  object 
 39  LM                        16122 non-null  object 
 40  LCM                       16122 non-null  object 
 41  CM                        16122 non-null  object 
 42  RCM                       16122 non-null  object 
 43  RM                        16122 non-null  object 
 44  LWB                       16122 non-null  object 
 45  LDM                       16122 non-null  object 
 46  CDM                       16122 non-null  object 
 47  RDM                       16122 non-null  object 
 48  RWB                       16122 non-null  object 
 49  LB                        16122 non-null  object 
 50  LCB                       16122 non-null  object 
 51  CB                        16122 non-null  object 
 52  RCB                       16122 non-null  object 
 53  RB                        16122 non-null  object 
 54  Crossing                  18159 non-null  float64
 55  Finishing                 18159 non-null  float64
 56  HeadingAccuracy           18159 non-null  float64
 57  ShortPassing              18159 non-null  float64
 58  Volleys                   18159 non-null  float64
 59  Dribbling                 18159 non-null  float64
 60  Curve                     18159 non-null  float64
 61  FKAccuracy                18159 non-null  float64
 62  LongPassing               18159 non-null  float64
 63  BallControl               18159 non-null  float64
 64  Acceleration              18159 non-null  float64
 65  SprintSpeed               18159 non-null  float64
 66  Agility                   18159 non-null  float64
 67  Reactions                 18159 non-null  float64
 68  Balance                   18159 non-null  float64
 69  ShotPower                 18159 non-null  float64
 70  Jumping                   18159 non-null  float64
 71  Stamina                   18159 non-null  float64
 72  Strength                  18159 non-null  float64
 73  LongShots                 18159 non-null  float64
 74  Aggression                18159 non-null  float64
 75  Interceptions             18159 non-null  float64
 76  Positioning               18159 non-null  float64
 77  Vision                    18159 non-null  float64
 78  Penalties                 18159 non-null  float64
 79  Composure                 18159 non-null  float64
 80  Marking                   18159 non-null  float64
 81  StandingTackle            18159 non-null  float64
 82  SlidingTackle             18159 non-null  float64
 83  GKDiving                  18159 non-null  float64
 84  GKHandling                18159 non-null  float64
 85  GKKicking                 18159 non-null  float64
 86  GKPositioning             18159 non-null  float64
 87  GKReflexes                18159 non-null  float64
 88  Release Clause            16643 non-null  object 
dtypes: float64(38), int64(6), object(45)
memory usage: 12.4+ MB
In [4]:
df.describe()
Out[4]:
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number ... Penalties Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes
count 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18159.000000 18159.000000 18159.000000 18147.000000 ... 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000
mean 9103.000000 214298.338606 25.122206 66.238699 71.307299 1597.809908 1.113222 2.947299 2.361308 19.546096 ... 48.548598 58.648274 47.281623 47.697836 45.661435 16.616223 16.391596 16.232061 16.388898 16.710887
std 5256.052511 29965.244204 4.669943 6.908930 6.136496 272.586016 0.394031 0.660456 0.756164 15.947765 ... 15.704053 11.436133 19.904397 21.664004 21.289135 17.695349 16.906900 16.502864 17.034669 17.955119
min 0.000000 16.000000 16.000000 46.000000 48.000000 731.000000 1.000000 1.000000 1.000000 1.000000 ... 5.000000 3.000000 3.000000 2.000000 3.000000 1.000000 1.000000 1.000000 1.000000 1.000000
25% 4551.500000 200315.500000 21.000000 62.000000 67.000000 1457.000000 1.000000 3.000000 2.000000 8.000000 ... 39.000000 51.000000 30.000000 27.000000 24.000000 8.000000 8.000000 8.000000 8.000000 8.000000
50% 9103.000000 221759.000000 25.000000 66.000000 71.000000 1635.000000 1.000000 3.000000 2.000000 17.000000 ... 49.000000 60.000000 53.000000 55.000000 52.000000 11.000000 11.000000 11.000000 11.000000 11.000000
75% 13654.500000 236529.500000 28.000000 71.000000 75.000000 1787.000000 1.000000 3.000000 3.000000 26.000000 ... 60.000000 67.000000 64.000000 66.000000 64.000000 14.000000 14.000000 14.000000 14.000000 14.000000
max 18206.000000 246620.000000 45.000000 94.000000 95.000000 2346.000000 5.000000 5.000000 5.000000 99.000000 ... 92.000000 96.000000 94.000000 93.000000 91.000000 90.000000 92.000000 91.000000 90.000000 94.000000

8 rows × 44 columns

In [5]:
import missingno as msno
msno.matrix(df, labels=True, sort="descending");
msno.bar(df)
Out[5]:
<Axes: >
In [6]:
# import the required libraries 
import sweetviz as sv
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

print("SweetViz Version : {}".format(sv.__version__))
# analyzing the dataset
report = sv.analyze(df)
# show the report in a form of an HTML file
report.show_html('Report.html')
SweetViz Version : 2.2.1
                                             |          | [  0%]   00:00 -> (? left)
Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
In [7]:
df.shape
Out[7]:
(18207, 89)
In [8]:
from dataprep.eda.missing import plot_missing
plot_missing(df)
  0%|          | 0/774 [00:00<?, ?it/s]
C:\Users\prose\anaconda3\Lib\site-packages\dask\core.py:121: RuntimeWarning: invalid value encountered in divide
  return func(*(_execute_task(a, cache) for a in args))
Out[8]:
DataPrep.EDA Report

Missing Statistics

Missing Cells76984
Missing Cells (%)4.8%
Missing Columns76
Missing Rows18207
Avg Missing Cells per Column864.99
Avg Missing Cells per Row4.23
'height': 500
Height of the plot
'width': 500
Width of the plot
'spectrum.bins': 20
Number of bins
'height': 500
Height of the plot
'width': 500
Width of the plot
'height': 500
Height of the plot
'width': 500
Width of the plot
'height': 500
Height of the plot
'width': 500
Width of the plot
In [21]:
# ignore warnings :
import warnings
warnings.filterwarnings('ignore')


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# for visualizations
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
plt.style.use('fivethirtyeight')

# to visualize missing data
import missingno as msno
In [22]:
# check for missing values
df.isnull().sum()
Out[22]:
Unnamed: 0           0
ID                   0
Name                 0
Age                  0
Photo                0
                  ... 
GKHandling          48
GKKicking           48
GKPositioning       48
GKReflexes          48
Release Clause    1564
Length: 89, dtype: int64
In [23]:
# fill missing values for continuous variables for proper data visualization
df['ShortPassing'].fillna(df['ShortPassing'].mean(), inplace = True)
df['Volleys'].fillna(df['Volleys'].mean(), inplace = True)
df['Dribbling'].fillna(df['Dribbling'].mean(), inplace = True)
df['Curve'].fillna(df['Curve'].mean(), inplace = True)
df['FKAccuracy'].fillna(df['FKAccuracy'], inplace = True)
df['LongPassing'].fillna(df['LongPassing'].mean(), inplace = True)
df['BallControl'].fillna(df['BallControl'].mean(), inplace = True)
df['HeadingAccuracy'].fillna(df['HeadingAccuracy'].mean(), inplace = True)
df['Finishing'].fillna(df['Finishing'].mean(), inplace = True)
df['Crossing'].fillna(df['Crossing'].mean(), inplace = True)
df['Weight'].fillna('200lbs', inplace = True)
df['Contract Valid Until'].fillna(2019, inplace = True)
df['Height'].fillna("5'11", inplace = True)
df['Loaned From'].fillna('None', inplace = True)
df['Joined'].fillna('Jul 1, 2018', inplace = True)
df['Jersey Number'].fillna(8, inplace = True)
df['Body Type'].fillna('Normal', inplace = True)
df['Position'].fillna('ST', inplace = True)
df['Club'].fillna('No Club', inplace = True)
df['Work Rate'].fillna('Medium/ Medium', inplace = True)
df['Skill Moves'].fillna(df['Skill Moves'].median(), inplace = True)
df['Weak Foot'].fillna(3, inplace = True)
df['Preferred Foot'].fillna('Right', inplace = True)
df['International Reputation'].fillna(1, inplace = True)
df['Wage'].fillna('€200K', inplace = True)
In [24]:
df.fillna(0, inplace = True)
In [29]:
df['Preferred Foot'].value_counts()
Out[29]:
Right    13996
Left      4211
Name: Preferred Foot, dtype: int64
In [35]:
#Soccer players have a certain active timeframe in their lives. The distribution of age is here
df['Age'].hist()
plt.title("Distribution of age of the players")
Out[35]:
Text(0.5, 1.0, 'Distribution of age of the players')
In [36]:
x = df['Age']
plt.figure(figsize = (12, 8))
plt.style.use('ggplot')
ax = sns.distplot(x, bins = 20, kde = True, color='g')
ax.set_xlabel(xlabel = 'Age of the Players', fontsize = 16)
ax.set_title(label = 'Histogram for Age distribution of Players', fontsize = 20)
plt.show()
In [38]:
df['International Reputation'].value_counts()
Out[38]:
1.0    16580
2.0     1261
3.0      309
4.0       51
5.0        6
Name: International Reputation, dtype: int64
In [40]:
labels = df['International Reputation'].value_counts().index
size = df['International Reputation'].value_counts()
explode = [0, 0.1, 0.1, 0, 0]
plt.pie(size, labels = labels, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of International Reputation of players', fontsize = 20)
plt.legend()
plt.show()
In [41]:
df['Work Rate'].value_counts()
Out[41]:
Medium/ Medium    9858
High/ Medium      3173
Medium/ High      1690
High/ High        1015
Medium/ Low        850
High/ Low          699
Low/ Medium        449
Low/ High          439
Low/ Low            34
Name: Work Rate, dtype: int64
In [42]:
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Work Rate'], data=df, palette = 'PuBuGn_d')
graph.set_title('Work Rate of the Players', fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
In [43]:
labels = df['Work Rate'].value_counts().index
size = df['Work Rate'].value_counts()
explode = [0,0,0.1,0,0.1,0,0,0,0]
plt.pie(size, labels = labels, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Work Rate of players', fontsize = 20)
plt.legend()
plt.show()
In [44]:
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Work Rate'], data=df, hue='Preferred Foot', palette = 'PuBuGn_d')
graph.set_title('Work Rate of Players segregated by Preferred Foot'  , fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
In [45]:
df['Skill Moves'].value_counts()
Out[45]:
2.0    8613
3.0    6600
1.0    2026
4.0     917
5.0      51
Name: Skill Moves, dtype: int64
In [46]:
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Skill Moves'], data=df, palette = 'PuBuGn_d')
graph.set_title('Skill Moves of the Players', fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
In [47]:
fig, ax = plt.subplots(figsize=(12,8))
graph = sns.countplot(ax=ax,x=df['Skill Moves'], data=df, hue='Preferred Foot', palette = 'PuBuGn_d')
graph.set_title('Skill Moves of Players segregated by Preferred Foot'  , fontsize = 20)
graph.set_xticklabels(graph.get_xticklabels(), rotation=30)
for p in graph.patches:
    height = p.get_height()
    graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
In [48]:
#Special Score of the Players
x = df['Special']
plt.figure(figsize=(18,10))
ax = sns.distplot(x)
ax.set_xlabel(xlabel = "Player's Special Scores", fontsize = 16)
ax.set_ylabel(ylabel = 'Number of Players', fontsize = 16)
ax.set_title(label = 'Distribution of Players Special Scores', fontsize = 20)
plt.show()
In [53]:
# different positions acquired by the players 
df['Position'].value_counts()
Out[53]:
ST     2212
GK     2025
CB     1778
CM     1394
LB     1322
RB     1291
RM     1124
LM     1095
CAM     958
CDM     948
RCB     662
LCB     648
LCM     395
RCM     391
LW      381
RW      370
RDM     248
LDM     243
LS      207
RS      203
RWB      87
LWB      78
CF       74
LAM      21
RAM      21
RF       16
LF       15
Name: Position, dtype: int64
In [55]:
df['Nationality'].nunique()
Out[55]:
164
In [56]:
top_countries = df['Nationality'].value_counts().head(10)

top_countries
Out[56]:
England        1662
Germany        1198
Spain          1072
Argentina       937
France          914
Brazil          827
Italy           702
Colombia        618
Japan           478
Netherlands     453
Name: Nationality, dtype: int64
In [58]:
#A word cloud of nationalities will help understand which nationalities are dominating. 
#For doing that we need to join all the nationalities and then make a word cloud.
nationality = " ".join(n for n in df['Nationality'])
In [59]:
from wordcloud import WordCloud
plt.figure(figsize=(10, 10))
wc = WordCloud().generate(nationality)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
In [60]:
fig, ax = plt.subplots(figsize=(12,8))
x = top_countries.values
y = top_countries.index
ax.barh(y, x, align='center', color='green')
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Number of Players')
ax.set_ylabel('Name of Countries', rotation=0)
ax.set_title('Top 10 Countries with most number of players')
plt.show()
In [61]:
top_countries_name = top_countries.index

top_countries_name
Out[61]:
Index(['England', 'Germany', 'Spain', 'Argentina', 'France', 'Brazil', 'Italy',
       'Colombia', 'Japan', 'Netherlands'],
      dtype='object')
In [62]:
#Age distribution from top countries
df_country_age = df.loc[df['Nationality'].isin(top_countries_name) & df['Age']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Nationality' , y = 'Age' , data = df_country_age, palette='rocket')
plt.title('Age Distribution in top countries')
plt.xticks(rotation = 50)
plt.show()
In [63]:
#Overall Rating
df_country_rating = df.loc[df['Nationality'].isin(top_countries_name) & df['Overall']]
plt.figure(1 , figsize = (12,6))
sns.barplot(x = 'Nationality' , y = 'Overall' , data = df_country_rating, palette='spring')
plt.title('Overall Rating Distribution of Players in top countries')
plt.xticks(rotation = 50)
plt.show()
In [64]:
#Potential Rating
df_country_rating = df.loc[df['Nationality'].isin(top_countries_name) & df['Potential']]
plt.figure(1 , figsize = (12,6))
sns.barplot(x = 'Nationality' , y = 'Potential' , data = df_country_rating, palette='PuBuGn_d')
plt.title('Potential Rating Distribution of Players in top countries')
plt.xticks(rotation = 50)
plt.show()
In [65]:
#Analyse players based on club
df['Club'].nunique()
Out[65]:
652
In [66]:
df['Club'].unique()
Out[66]:
array(['FC Barcelona', 'Juventus', 'Paris Saint-Germain',
       'Manchester United', 'Manchester City', 'Chelsea', 'Real Madrid',
       'Atlético Madrid', 'FC Bayern München', 'Tottenham Hotspur',
       'Liverpool', 'Napoli', 'Arsenal', 'Milan', 'Inter', 'Lazio',
       'Borussia Dortmund', 'Vissel Kobe', 'Olympique Lyonnais', 'Roma',
       'Valencia CF', 'Guangzhou Evergrande Taobao FC', 'FC Porto',
       'FC Schalke 04', 'Beşiktaş JK', 'LA Galaxy', 'Sporting CP',
       'Real Betis', 'Olympique de Marseille', 'RC Celta',
       'Bayer 04 Leverkusen', 'Real Sociedad', 'Villarreal CF',
       'Sevilla FC', 'SL Benfica', 'AS Saint-Étienne', 'AS Monaco',
       'Leicester City', 'Atalanta', 'Grêmio', 'Atlético Mineiro',
       'RB Leipzig', 'Ajax', 'Dalian YiFang FC', 'Everton',
       'West Ham United', '1. FC Köln', 'TSG 1899 Hoffenheim',
       'Shanghai SIPG FC', 'OGC Nice', 'Al Nassr',
       'Wolverhampton Wanderers', 'Borussia Mönchengladbach',
       'Hertha BSC', 'SV Werder Bremen', 'Cruzeiro',
       'Athletic Club de Bilbao', 'Torino', 'Medipol Başakşehir FK',
       'Beijing Sinobo Guoan FC', 'Crystal Palace', 'PFC CSKA Moscow',
       'VfL Wolfsburg', 'Shakhtar Donetsk', 'Toronto FC',
       'Lokomotiv Moscow', 'Sassuolo', 'New York City FC', 'Fluminense',
       'PSV', 'Levante UD', 'Fulham', 'Watford', 'Atlanta United',
       'Montpellier HSC', 'Galatasaray SK', 'Fenerbahçe SK', 'SD Eibar',
       'Los Angeles FC', 'Sampdoria', 'Al Hilal', 'VfB Stuttgart',
       'SC Braga', 'River Plate', 'Deportivo Alavés', 'No Club',
       'Eintracht Frankfurt', 'Girona FC', 'Guangzhou R&F; FC', 'Burnley',
       'Stoke City', 'Southampton', 'Tianjin Quanjian FC', 'Getafe CF',
       'Beijing Renhe FC', 'Montreal Impact', 'Chievo Verona', 'Genoa',
       'Portland Timbers', 'Tigres U.A.N.L.', 'RCD Espanyol',
       'Hebei China Fortune FC', 'Cagliari', 'Chicago Fire', 'DC United',
       'Sagan Tosu', 'Dynamo Kyiv', 'Santos', 'Internacional',
       'América FC (Minas Gerais)', 'Independiente', 'Boca Juniors',
       'Cruz Azul', '1. FSV Mainz 05', 'Bournemouth', 'Spartak Moscow',
       'Racing Club', 'FC Augsburg', 'Fiorentina', 'FC Nantes',
       'Feyenoord', 'Club Brugge KV', 'Brighton & Hove Albion', 'Al Ahli',
       'Jiangsu Suning FC', 'SC Freiburg', 'PAOK', 'Stade Rennais FC',
       'Trabzonspor', 'SPAL', 'Portimonense SC', 'Olympiacos CFP',
       'Club Atlético Huracán', 'Kasimpaşa SK', 'Newcastle United',
       'Frosinone', 'Querétaro', 'KRC Genk', 'Hannover 96',
       'Stade Malherbe Caen', 'Godoy Cruz', 'Toulouse Football Club',
       'RSC Anderlecht', 'Huddersfield Town', 'CD Tondela',
       'Seattle Sounders FC', 'Hamburger SV', 'FC Red Bull Salzburg',
       'Rio Ave FC', 'FC Girondins de Bordeaux', 'Melbourne Victory',
       'Parma', 'FC Basel 1893', 'Al Wehda', 'BSC Young Boys', 'KAA Gent',
       'Al Ittihad', 'Standard de Liège', 'Shanghai Greenland Shenhua FC',
       'Colo-Colo', 'Junior FC', 'West Bromwich Albion',
       'RC Strasbourg Alsace', 'Göztepe SK', 'Deportivo Cali',
       'Deportivo Toluca', 'Bologna', 'Nagoya Grampus', 'Amiens SC',
       'Changchun Yatai FC', 'Club Atlético Lanús', 'Botafogo',
       'Club América', 'Udinese', 'Real Valladolid CF', 'CD Leganés',
       'Club Atlético Banfield', 'Celtic', 'Vitória Guimarães',
       'FC København', 'UD Las Palmas', 'Deportivo de La Coruña',
       'Universidad Católica', 'San Lorenzo de Almagro', 'Rayo Vallecano',
       'Monterrey', 'Columbus Crew SC', 'MKE Ankaragücü',
       'Guizhou Hengfeng FC', 'Swansea City', 'Tianjin TEDA FC',
       'Chongqing Dangdai Lifan FC SWM Team', 'AEK Athens', 'Al Taawoun',
       'Melbourne City FC', 'En Avant de Guingamp',
       'Akhisar Belediyespor', 'Foggia', 'LOSC Lille', '1. FC Nürnberg',
       'Clube Sport Marítimo', 'Real Sporting de Gijón', 'BB Erzurumspor',
       'Shandong Luneng TaiShan FC', 'Club Atlético Colón', 'Bahia',
       'Once Caldas', 'FC Groningen', 'Angers SCO', 'Paraná',
       'Antalyaspor', 'Minnesota United FC', 'Club León', 'Empoli',
       'VVV-Venlo', 'Leeds United', 'Viktoria Plzeň', 'Alanyaspor',
       'Atlético Paranaense', 'Derby County', 'Kawasaki Frontale',
       'Cardiff City', 'Aston Villa', 'Guadalajara', 'Dijon FCO',
       'Santos Laguna', 'Málaga CF', 'Vitória', 'Çaykur Rizespor',
       'U.N.A.M.', 'Nottingham Forest', 'Royal Antwerp FC',
       'Club Tijuana', 'Sport Club do Recife', 'Real Salt Lake',
       'AZ Alkmaar', 'SK Slavia Praha', 'Willem II', 'Middlesbrough',
       'Dinamo Zagreb', 'Club Atlas', 'Granada CF', 'Sydney FC',
       'Sporting Kansas City', 'SV Zulte-Waregem', 'Philadelphia Union',
       'Real Oviedo', 'Pachuca', 'Boavista FC', 'Atiker Konyaspor',
       'Kaizer Chiefs', 'GD Chaves', 'Palermo', 'Atlético Nacional',
       'Puebla FC', 'Perth Glory', 'Panathinaikos FC', 'FC Sion',
       'Vitória de Setúbal', 'New York Red Bulls', 'Al Shabab',
       'Monarcas Morelia', 'Albacete BP', 'Rangers FC', 'Sparta Praha',
       'Legia Warszawa', 'Urawa Red Diamonds', 'Rosario Central',
       'Stade de Reims', 'ADO Den Haag', 'Chapecoense', 'FC Midtjylland',
       'San Jose Earthquakes', 'Belgrano de Córdoba', 'Brescia',
       'Kashima Antlers', 'CD Everton de Viña del Mar',
       'Fortuna Düsseldorf', 'SD Huesca', 'Preston North End',
       'Club Atlético Talleres', 'Benevento', 'Vitesse',
       'Gimnasia y Esgrima La Plata', 'Houston Dynamo', 'Club Necaxa',
       'Norwich City', 'Holstein Kiel', 'Ettifaq FC', 'Kayserispor',
       '1. FC Heidenheim 1846', 'Brentford', 'Yeni Malatyaspor',
       'Lobos BUAP', 'Bursaspor', 'Ceará Sporting Club',
       'Sheffield United', 'FC Ingolstadt 04', 'Estudiantes de La Plata',
       'AIK', 'Queens Park Rangers', 'Suwon Samsung Bluewings',
       'Heart of Midlothian', 'Reading', 'FC Dallas', 'Heracles Almelo',
       'Venezia FC', 'CD Lugo', 'Henan Jianye FC', 'Orlando City SC',
       'CA Osasuna', 'NAC Breda', 'Livorno', 'Universidad de Chile',
       'Brøndby IF', 'Aberdeen', 'Defensa y Justicia', 'Atlético Tucumán',
       'Blackburn Rovers', 'SV Darmstadt 98', 'Moreirense FC',
       'Sanfrecce Hiroshima', 'CD Numancia', 'KV Oostende', 'FC Utrecht',
       'Vancouver Whitecaps FC', 'Odense Boldklub', 'SC Heerenveen',
       'Racing Club de Lens', 'Independiente Santa Fe',
       'Sporting de Charleroi', 'Millonarios FC', 'Sheffield Wednesday',
       'Perugia', 'Daegu FC', 'Vélez Sarsfield',
       'Grasshopper Club Zürich', 'Sivasspor', 'Nîmes Olympique',
       'Rosenborg BK', 'SK Sturm Graz', 'FC Metz',
       'CD Universidad de Concepción', 'Hellas Verona', 'Brisbane Roar',
       'CD Feirense', 'Hull City', 'Waasland-Beveren', 'Neuchâtel Xamax',
       'Real Zaragoza', 'CD Aves', 'Millwall', 'Unión de Santa Fe',
       'KAS Eupen', 'Cádiz CF', 'FC Tokyo', 'CD Tenerife',
       '1. FC Union Berlin', 'Al Fayha', 'AJ Auxerre',
       'Patriotas Boyacá FC', 'Molde FK', 'Bristol City', 'CD Nacional',
       'Sporting Lokeren', 'FC St. Pauli', 'Deportes Iquique',
       'Al Qadisiyah', 'Atlético Bucaramanga', 'Club Atlético Tigre',
       'FK Austria Wien', 'Patronato', 'Malmö FF', 'Kashiwa Reysol',
       'US Cremonese', 'VfL Bochum 1848', 'SK Rapid Wien',
       'KSV Cercle Brugge', 'Rionegro Águilas', 'Gimnàstic de Tarragona',
       'Lecce', 'Santa Clara', 'BK Häcken', 'New England Revolution',
       'Orlando Pirates', 'Atlético Huila', 'Western Sydney Wanderers',
       'Kalmar FF', 'Independiente Medellín', 'Fortuna Sittard',
       'Lech Poznań', 'Djurgårdens IF', 'CF Reus Deportiu', 'SK Brann',
       'Ulsan Hyundai FC', 'Sint-Truidense VV', 'Carpi', 'Al Fateh',
       'Royal Excel Mouscron', 'AC Ajaccio', 'PEC Zwolle', 'Sunderland',
       'Club Atlético Aldosivi', 'US Salernitana 1919', 'FC Lorient',
       'Argentinos Juniors', 'AD Alcorcón', 'Crotone', 'Excelsior',
       'KV Kortrijk', 'IFK Norrköping', 'Adelaide United',
       'FC St. Gallen', 'Tiburones Rojos de Veracruz', 'CD Palestino',
       'Jeju United FC', 'Deportes Tolima', 'Jeonbuk Hyundai Motors',
       'Birmingham City', 'América de Cali', 'La Equidad', 'Spezia',
       'Aalborg BK', 'Le Havre AC', 'Górnik Zabrze',
       'Central Coast Mariners', 'Wigan Athletic',
       'Jagiellonia Białystok', 'Cittadella', 'Hibernian', 'FC Lugano',
       'San Martín de San Juan', 'Strømsgodset IF', 'Júbilo Iwata',
       "Newell's Old Boys", 'Al Faisaly', 'Colorado Rapids',
       'IF Elfsborg', 'SV Sandhausen', 'Al Batin', 'Stade Brestois 29',
       'UD Almería', 'Gyeongnam FC', 'Yokohama F. Marinos', 'Kilmarnock',
       'Pescara', 'Newcastle Jets', 'Córdoba CF', 'RCD Mallorca',
       'Hammarby IF', 'Cerezo Osaka', 'KFC Uerdingen 05',
       'Shimizu S-Pulse', 'MSV Duisburg', 'Os Belenenses',
       'DSC Arminia Bielefeld', 'Ipswich Town', 'FC Seoul',
       'Lechia Gdańsk', 'Gamba Osaka', 'CF Rayo Majadahonda', 'LASK Linz',
       'Bolton Wanderers', 'Al Raed', 'Extremadura UD', 'SC Paderborn 07',
       'Wellington Phoenix', 'Unión Española', 'Alianza Petrolera',
       'Cracovia', 'Gangwon FC', 'Elche CF', 'ESTAC Troyes', 'AS Béziers',
       'La Berrichonne de Châteauroux', 'Clermont Foot 63',
       '1. FC Magdeburg', 'Pohang Steelers', 'Örebro SK', 'Arka Gdynia',
       'SG Dynamo Dresden', 'SpVgg Greuther Fürth', 'CD Huachipato',
       'Wisła Kraków', 'Stabæk Fotball', 'Eintracht Braunschweig',
       'Valenciennes FC', 'FC Thun', 'San Luis de Quillota',
       ' SSV Jahn Regensburg', 'Cosenza', 'FC Nordsjælland',
       'FC Erzgebirge Aue', 'Jeonnam Dragons', 'Wolfsberger AC',
       'Chamois Niortais Football Club', 'Club Deportes Temuco',
       'AS Nancy Lorraine', 'Red Star FC', 'Al Hazem', 'Pogoń Szczecin',
       'Charlton Athletic', 'Grenoble Foot 38', 'FC Hansa Rostock',
       'San Martin de Tucumán', 'Incheon United FC', 'Śląsk Wrocław',
       'GFC Ajaccio', '1. FC Kaiserslautern', 'Deportivo Pasto',
       'Lincoln City', 'Motherwell', 'Rotherham United', 'Burton Albion',
       'Wisła Płock', 'FC Wacker Innsbruck', 'Peterborough United',
       'Ascoli', 'FC Zürich', 'Fleetwood Town', 'Padova',
       'FC Sochaux-Montbéliard', 'SV Wehen Wiesbaden', 'Unión La Calera',
       'Scunthorpe United', "CD O'Higgins", 'CD Antofagasta',
       'Plymouth Argyle', 'Aarhus GF', 'Lillestrøm SK', 'Karlsruher SC',
       'GIF Sundsvall', 'FC Emmen', 'Barnsley', 'Audax Italiano',
       'V-Varen Nagasaki', 'Paris FC', 'SpVgg Unterhaching', 'Hobro IK',
       'De Graafschap', 'Hokkaido Consadole Sapporo', 'Tromsø IL',
       'FC Luzern', 'FK Haugesund', 'Zagłębie Lubin', 'VfR Aalen',
       'Dundalk', 'Oxford United', 'Piast Gliwice', 'Ohod Club',
       'Östersunds FK', 'Vegalta Sendai', 'Crawley Town',
       'FC Admira Wacker Mödling', 'Vålerenga Fotball', 'Dundee FC',
       'Portsmouth', 'Envigado FC', 'Miedź Legnica', 'Odds BK',
       'SC Fortuna Köln', 'US Orléans Loiret Football', 'Sarpsborg 08 FF',
       'Jaguares de Córdoba', 'Bradford City', 'Accrington Stanley',
       'St. Johnstone FC', 'Boyacá Chicó FC', 'Luton Town',
       'SV Mattersburg', 'Kristiansund BK', 'Sangju Sangmu FC',
       'Rochdale', 'Walsall', 'Korona Kielce', 'Shonan Bellmare',
       'FC Würzburger Kickers', 'FSV Zwickau', 'St. Mirren', 'AC Horsens',
       'Esbjerg fB', 'HJK Helsinki', 'Southend United', 'Bristol Rovers',
       'Hamilton Academical FC', 'TSV 1860 München', 'Curicó Unido',
       'SCR Altach', 'Ranheim Fotball', 'Stevenage',
       'SG Sonnenhof Großaspach', 'Oldham Athletic', 'Milton Keynes Dons',
       'FK Bodø/Glimt', 'SC Preußen Münster', 'Wycombe Wanderers',
       'Vejle Boldklub', 'Bury', 'Randers FC', 'VfL Osnabrück',
       'SønderjyskE', 'IFK Göteborg', 'Mansfield Town', 'Coventry City',
       'Waterford FC', 'Shrewsbury', 'IK Start', 'Gillingham',
       'FC Energie Cottbus', 'FC Carl Zeiss Jena', 'Hallescher FC',
       'SV Meppen', 'AFC Wimbledon', 'Blackpool', 'Doncaster Rovers',
       'Sandefjord Fotball', 'VfL Sportfreunde Lotte', 'Cheltenham Town',
       'IK Sirius', 'Vendsyssel FF', 'Swindon Town', 'Notts County',
       'SKN St. Pölten', 'Exeter City', 'Northampton Town',
       'Shamrock Rovers', 'Colchester United', 'Livingston FC',
       'TSV Hartberg', 'Tranmere Rovers', 'Cambridge United',
       'Grimsby Town', 'Port Vale', 'Itagüí Leones FC',
       'Forest Green Rovers', 'Dalkurd FF', 'Zagłębie Sosnowiec',
       'Carlisle United', 'Trelleborgs FF', "St. Patrick's Athletic",
       'Morecambe', 'Cork City', 'IF Brommapojkarna', 'Crewe Alexandra',
       'Yeovil Town', 'Bohemian FC', 'Macclesfield Town',
       'Newport County', 'Sligo Rovers', 'Derry City', 'Limerick FC',
       'Bray Wanderers'], dtype=object)
In [67]:
clubs = ['FC Barcelona','Real Madrid','Juventus','Liverpool','Manchester United',
         'Chelsea','Arsenal','Paris Saint-Germain' ,'FC Bayern München','Manchester City']
In [68]:
#Age distribution in famous clubs
df_club_age = df.loc[df['Club'].isin(clubs) & df['Age']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Club', y = 'Age' , data = df_club_age, palette='spring')
plt.title('Age Distribution in famous clubs')
plt.xticks(rotation = 50)
plt.show()
In [69]:
#Overall Rating in famous clubs
df_club_rating = df.loc[df['Club'].isin(clubs) & df['Overall']]
plt.figure(1 , figsize = (12,6))
sns.boxplot(x = 'Club' , y = 'Overall' , data = df_club_rating, palette='PuBuGn_d')
plt.title('Overall Rating Distribution in famous clubs')
plt.xticks(rotation = 50)
plt.show()
In [70]:
#The Best Clubs with Players Overall Rating
best_dict = {}
for club in df['Club'].unique():
    overall_rating = df['Overall'][df['Club'] == club].sum()
    best_dict[club] = overall_rating
best_club = pd.DataFrame.from_dict(best_dict, orient='index', columns = ['overall'])
best_club['club'] = best_club.index
best_club = best_club.sort_values(by = 'overall', ascending =  False)

plt.figure(1 , figsize = (15 , 6))
sns.barplot(x ='club',y ='overall',data = best_club.head(10),palette='Reds')  
plt.xticks(rotation = 70)
plt.xlabel("Club")
plt.ylabel('Sum of Overall Rating of players in club')
plt.title('Clubs with best Players (sum of overall ratings of players per club)')
plt.ylim(2450 , 2600)
plt.show()
In [71]:
#Profiling top players
#The Best Players
df_best_players = pd.DataFrame.copy(df.sort_values(by ='Overall',ascending = False ).head(10))
plt.figure(1,figsize = (12,6))
sns.barplot(x ='Name' , y = 'Overall' , data = df_best_players, palette='PuBuGn_d')
plt.ylim(85,95)
plt.show()
In [72]:
#The Highest Earners
def normalizing_wage(x):
    if '€' in str(x) and 'M' in str(x):
        c = str(x).replace('€' , '')
        c = str(c).replace('M' , '')
        c = float(c) * 1000000

    else:
        c = str(x).replace('€' , '')
        c = str(c).replace('K' , '')
        c = float(c) * 1000

    return c

df['Normalized_Wage'] = df['Wage'].apply(lambda x : normalizing_wage(x))
df.sort_values(by = 'Normalized_Wage' , ascending = False)[['Name','Club','Nationality','Overall',
'Age','Normalized_Wage','Wage]].head(5)
Out[72]:
Name Club Nationality Overall Age Normalized_Wage Wage
0 L. Messi FC Barcelona Argentina 94 31 565000.0 €565
7 L. Suárez FC Barcelona Uruguay 91 31 455000.0 €455
6 L. Modrić Real Madrid Croatia 91 32 420000.0 €420
1 Cristiano Ronaldo Juventus Portugal 94 33 405000.0 €405
8 Sergio Ramos Real Madrid Spain 91 32 380000.0 €380
In [73]:
#The Eldest Players
df.sort_values(by = 'Age' , ascending = False)[['Name','Club','Nationality','Overall', 'Age' ]].head()
Out[73]:
Name Club Nationality Overall Age
4741 O. Pérez Pachuca Mexico 71 45
18183 K. Pilkington Cambridge United England 48 44
17726 T. Warner Accrington Stanley Trinidad & Tobago 53 44
10545 S. Narazaki Nagoya Grampus Japan 65 42
7225 C. Muñoz CD Universidad de Concepción Argentina 68 41
In [74]:
#The Youngest Players
df.sort_values(by = 'Age' , ascending = True)[['Name','Club','Nationality','Overall', 'Age' ]].head()
Out[74]:
Name Club Nationality Overall Age
18206 G. Nugent Tranmere Rovers England 46 16
17743 J. Olstad Sarpsborg 08 FF Norway 52 16
13293 H. Massengo AS Monaco France 62 16
16081 J. Italiano Perth Glory Australia 58 16
18166 N. Ayéva Örebro SK Sweden 48 16
In [75]:
#The Best Freekick Takers
df.sort_values(by = 'FKAccuracy' , ascending = False)[['Name','Club','Nationality','Age','FKAccuracy']].head()
Out[75]:
Name Club Nationality Age FKAccuracy
0 L. Messi FC Barcelona Argentina 31 94.0
293 S. Giovinco Toronto FC Italy 31 93.0
72 M. Pjanić Juventus Bosnia Herzegovina 28 92.0
1113 E. Bardhi Levante UD FYR Macedonia 22 91.0
90 Parejo Valencia CF Spain 29 90.0
In [76]:
#The Best Penalty Kick Taker
df.sort_values(by = 'Penalties' , ascending = False)[['Name','Club','Nationality','Age','Penalties']].head()
Out[76]:
Name Club Nationality Age Penalties
206 M. Balotelli OGC Nice Italy 27 92.0
118 Fabinho Liverpool Brazil 24 91.0
16 H. Kane Tottenham Hotspur England 24 90.0
823 R. Jiménez Wolverhampton Wanderers Mexico 27 90.0
945 L. Baines Everton England 33 90.0
In [77]:
#Best players with the Ball Control
df.sort_values(by = 'BallControl' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','BallControl']].head()
Out[77]:
Name Club Nationality Overall Age BallControl
0 L. Messi FC Barcelona Argentina 94 31 96.0
2 Neymar Jr Paris Saint-Germain Brazil 92 26 95.0
30 Isco Real Madrid Spain 88 26 95.0
13 David Silva Manchester City Spain 90 32 94.0
5 E. Hazard Chelsea Belgium 91 27 94.0
In [78]:
#Fastest Players
df.sort_values(by = 'SprintSpeed' , ascending  False)[['Name','Club','Nationality','Overall', 'Age','SprintSpeed']].head()
Out[78]:
Name Club Nationality Overall Age SprintSpeed
55 L. Sané Manchester City Germany 86 22 96.0
25 K. Mbappé Paris Saint-Germain France 88 19 96.0
1968 Adama Wolverhampton Wanderers Spain 75 22 96.0
36 G. Bale Real Madrid Wales 88 28 95.0
10928 Maicon Livorno Brazil 65 25 95.0
In [79]:
#The Best Dribbler
df.sort_values(by = 'Dribbling' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','Dribbling']].head()
Out[79]:
Name Club Nationality Overall Age Dribbling
0 L. Messi FC Barcelona Argentina 94 31 97.0
2 Neymar Jr Paris Saint-Germain Brazil 92 26 96.0
5 E. Hazard Chelsea Belgium 91 27 95.0
30 Isco Real Madrid Spain 88 26 94.0
94 Y. Brahimi FC Porto Algeria 85 28 93.0
In [80]:
#The Best Finisher
df.sort_values(by = 'Finishing' , ascending = False)[['Name','Club','Nationality','Overall', 'Age','Finishing']].head()
Out[80]:
Name Club Nationality Overall Age Finishing
0 L. Messi FC Barcelona Argentina 94 31 95.0
16 H. Kane Tottenham Hotspur England 89 24 94.0
1 Cristiano Ronaldo Juventus Portugal 94 33 94.0
7 L. Suárez FC Barcelona Uruguay 91 31 93.0
23 S. Agüero Manchester City Argentina 89 30 93.0
In [81]:
#Distribution of weak foot
df['Weak Foot'].value_counts()
Out[81]:
3.0    11397
2.0     3761
4.0     2662
5.0      229
1.0      158
Name: Weak Foot, dtype: int64
In [82]:
labels = df['Weak Foot'].value_counts().index
size = df['Weak Foot'].value_counts()
colors=['cyan','pink','orange','lightgreen','yellow']
explode = [0, 0.1, 0.1, 0, 0]
plt.pie(size, labels = labels, colors = colors, explode = explode, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Weak Foot among players', fontsize = 20)
plt.legend()
plt.show()
In [83]:
df['Preferred Foot'].value_counts()/len(df)
Out[83]:
Right    0.768715
Left     0.231285
Name: Preferred Foot, dtype: float64
In [84]:
labels = df['Preferred Foot'].value_counts().index
size = df['Preferred Foot'].value_counts()
colors=['cyan','pink']
plt.pie(size, labels = labels, colors = colors, shadow = True, autopct='%1.1f%%',startangle = 90)
plt.title('Distribution of Preferred Foot among players', fontsize = 20)
plt.legend()
plt.show()
In [89]:
#Segregation of Indian Players
def country(x):
    return df[df['Nationality'] == x].head()


# prepare dataset for Indian players
country('India')
Out[89]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
8605 8605 190939 S. Chhetri 33 https://cdn.sofifa.org/players/4/19/190939.png India https://cdn.sofifa.org/flags/159.png 67 67 No Club ... 30.0 24.0 44.0 10.0 7.0 11.0 9.0 10.0 0 0.0
10011 10011 223763 S. Jhingan 24 https://cdn.sofifa.org/players/4/19/223763.png India https://cdn.sofifa.org/flags/159.png 65 71 No Club ... 64.0 61.0 60.0 13.0 11.0 7.0 11.0 12.0 0 0.0
12598 12598 217225 J. Lalpekhlua 27 https://cdn.sofifa.org/players/4/19/217225.png India https://cdn.sofifa.org/flags/159.png 63 64 No Club ... 28.0 31.0 29.0 13.0 11.0 10.0 10.0 11.0 0 0.0
12811 12811 225213 G. Singh Sandhu 26 https://cdn.sofifa.org/players/4/19/225213.png India https://cdn.sofifa.org/flags/159.png 63 68 No Club ... 19.0 15.0 11.0 63.0 59.0 59.0 62.0 64.0 0 0.0
13508 13508 238205 A. Edathodika 31 https://cdn.sofifa.org/players/4/19/238205.png India https://cdn.sofifa.org/flags/159.png 62 62 No Club ... 67.0 62.0 68.0 14.0 15.0 14.0 11.0 7.0 0 0.0

5 rows × 90 columns

In [93]:
sns.lineplot(data=df, x="Age", y="Overall")
Out[93]:
<Axes: xlabel='Age', ylabel='Overall'>
In [96]:
sns.lineplot(data=df, x="Age", y="Overall")
Out[96]:
<Axes: xlabel='Age', ylabel='Overall'>
In [99]:
df['Nationality'].value_counts()[0:10]
Out[99]:
England        1662
Germany        1198
Spain          1072
Argentina       937
France          914
Brazil          827
Italy           702
Colombia        618
Japan           478
Netherlands     453
Name: Nationality, dtype: int64
In [105]:
import matplotlib.pyplot as plt

# Create a figure
plt.figure(figsize=(8, 5))

# Select the top 5 Nationalities and create a bar plot
top_nationalities = df['Nationality'].value_counts().head(5)
plt.bar(top_nationalities.index, top_nationalities, color="g")

# Show the plot
plt.show()
In [108]:
#Finding Out Which Player Gets The Highest Wages.
player_salary = df[['Name' , 'Wage']]
player_salary.head()
Out[108]:
Name Wage
0 L. Messi €565
1 Cristiano Ronaldo €405
2 Neymar Jr €290
3 De Gea €260
4 K. De Bruyne €355
In [122]:
import matplotlib.pyplot as plt

# Create a figure with a specified size
plt.figure(figsize=(8, 5))

# Define the data you want to plot as regular Python lists
players = player_salary['Name'][0:5]
wages = player_salary['Wage'][0:5]

# Create a bar chart
plt.bar(players, wages, color="red")

# Display the chart
plt.show()
In [128]:
#Germany
Germany = df[df['Nationality'] == 'Germany']
Germany.head()
Out[128]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
11 11 182521 T. Kroos 28 https://cdn.sofifa.org/players/4/19/182521.png Germany https://cdn.sofifa.org/flags/21.png 90 90 Real Madrid ... 72.0 79.0 69.0 10.0 11.0 13.0 7.0 10.0 €156.8 355000.0
18 18 192448 M. ter Stegen 26 https://cdn.sofifa.org/players/4/19/192448.png Germany https://cdn.sofifa.org/flags/21.png 89 92 FC Barcelona ... 25.0 13.0 10.0 87.0 85.0 88.0 85.0 90.0 €123.3 240000.0
22 22 167495 M. Neuer 32 https://cdn.sofifa.org/players/4/19/167495.png Germany https://cdn.sofifa.org/flags/21.png 89 89 FC Bayern München ... 17.0 10.0 11.0 90.0 86.0 91.0 87.0 87.0 €62.7 130000.0
34 34 178603 M. Hummels 29 https://cdn.sofifa.org/players/4/19/178603.png Germany https://cdn.sofifa.org/flags/21.png 88 88 FC Bayern München ... 88.0 90.0 88.0 15.0 6.0 10.0 5.0 6.0 €75.9 160000.0
55 55 222492 L. Sané 22 https://cdn.sofifa.org/players/4/19/222492.png Germany https://cdn.sofifa.org/flags/21.png 86 92 Manchester City ... 36.0 32.0 35.0 8.0 12.0 9.0 9.0 14.0 €125.1 195000.0

5 rows × 90 columns

In [132]:
#Finding  Out Who Is The Tallest German Player Or The Player Belongs To German Nationality
Germany.sort_values(by=[ 'Height'], ascending=False).head()
Out[132]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
17786 17786 240218 A. Weidinger 21 https://cdn.sofifa.org/players/4/19/240218.png Germany https://cdn.sofifa.org/flags/21.png 52 60 SSV Jahn Regensburg ... 14.0 13.0 11.0 51.0 54.0 56.0 56.0 49.0 €105K 1000.0
7785 7785 236831 A. Seydel 22 https://cdn.sofifa.org/players/4/19/236831.png Germany https://cdn.sofifa.org/flags/21.png 67 76 Holstein Kiel ... 25.0 28.0 22.0 12.0 9.0 14.0 11.0 9.0 0 9000.0
13520 13520 239746 L. Watkowiak 22 https://cdn.sofifa.org/players/4/19/239746.png Germany https://cdn.sofifa.org/flags/21.png 62 68 SV Wehen Wiesbaden ... 7.0 13.0 13.0 65.0 59.0 64.0 60.0 66.0 €495K 1000.0
4542 4542 158657 T. Kessler 32 https://cdn.sofifa.org/players/4/19/158657.png Germany https://cdn.sofifa.org/flags/21.png 71 71 1. FC Köln ... 12.0 14.0 15.0 72.0 69.0 67.0 70.0 71.0 €2 10000.0
1426 1426 199833 L. Unnerstall 27 https://cdn.sofifa.org/players/4/19/199833.png Germany https://cdn.sofifa.org/flags/21.png 76 78 VVV-Venlo ... 11.0 17.0 15.0 76.0 75.0 72.0 74.0 77.0 0 12000.0

5 rows × 90 columns

In [134]:
#Finding Out The German Player Who Has The Highest Weight
Germany.sort_values(by=[ 'Weight'], ascending=False).head()
Out[134]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
13520 13520 239746 L. Watkowiak 22 https://cdn.sofifa.org/players/4/19/239746.png Germany https://cdn.sofifa.org/flags/21.png 62 68 SV Wehen Wiesbaden ... 7.0 13.0 13.0 65.0 59.0 64.0 60.0 66.0 €495K 1000.0
1426 1426 199833 L. Unnerstall 27 https://cdn.sofifa.org/players/4/19/199833.png Germany https://cdn.sofifa.org/flags/21.png 76 78 VVV-Venlo ... 11.0 17.0 15.0 76.0 75.0 72.0 74.0 77.0 0 12000.0
210 210 179783 R. Fährmann 29 https://cdn.sofifa.org/players/4/19/179783.png Germany https://cdn.sofifa.org/flags/21.png 83 84 FC Schalke 04 ... 10.0 12.0 10.0 83.0 81.0 52.0 82.0 87.0 €35.5 38000.0
165 165 213331 J. Tah 22 https://cdn.sofifa.org/players/4/19/213331.png Germany https://cdn.sofifa.org/flags/21.png 83 88 Bayer 04 Leverkusen ... 80.0 88.0 84.0 11.0 8.0 7.0 9.0 14.0 €52.4 67000.0
1225 1225 200212 M. Esser 30 https://cdn.sofifa.org/players/4/19/200212.png Germany https://cdn.sofifa.org/flags/21.png 76 76 Hannover 96 ... 20.0 15.0 16.0 76.0 74.0 69.0 75.0 78.0 €8.9 24000.0

5 rows × 90 columns

In [135]:
#Finding  Out Who Are The Top Most Earning German Players.
Germany.sort_values(by=[ 'Wage'], ascending=False).head()
Out[135]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
82 82 212622 J. Kimmich 23 https://cdn.sofifa.org/players/4/19/212622.png Germany https://cdn.sofifa.org/flags/21.png 85 88 FC Bayern München ... 75.0 81.0 79.0 8.0 15.0 7.0 15.0 15.0 €69.9 92000.0
184 184 202166 J. Draxler 24 https://cdn.sofifa.org/players/4/19/202166.png Germany https://cdn.sofifa.org/flags/21.png 83 86 Paris Saint-Germain ... 39.0 64.0 44.0 11.0 13.0 5.0 13.0 8.0 €62.6 91000.0
448 448 208333 E. Can 24 https://cdn.sofifa.org/players/4/19/208333.png Germany https://cdn.sofifa.org/flags/21.png 80 85 Juventus ... 82.0 81.0 80.0 14.0 8.0 8.0 13.0 11.0 €33.6 91000.0
5613 5613 238072 E. Löwen 21 https://cdn.sofifa.org/players/4/19/238072.png Germany https://cdn.sofifa.org/flags/21.png 70 80 1. FC Nürnberg ... 61.0 66.0 60.0 5.0 10.0 5.0 12.0 13.0 €5 9000.0
8607 8607 158172 M. Parensen 32 https://cdn.sofifa.org/players/4/19/158172.png Germany https://cdn.sofifa.org/flags/21.png 67 67 1. FC Union Berlin ... 66.0 63.0 65.0 7.0 15.0 7.0 9.0 9.0 €680K 9000.0

5 rows × 90 columns

In [136]:
#Columns with NaN null values
df.columns[df.isna().any()]
Out[136]:
Index([], dtype='object')
In [138]:
#checking for NaN values 
df.isna().any().any()
Out[138]:
False
In [139]:
#Players who are goalkeepers
strickers_df=df[['Name','Position','Nationality','Club','Overall','Potential']]
strickers_df=strickers_df[strickers_df['Position'].str.contains(pat='GK')]
strickers_df.sort_values('Overall',ascending=False).head(20)
Out[139]:
Name Position Nationality Club Overall Potential
3 De Gea GK Spain Manchester United 91 93
9 J. Oblak GK Slovenia Atlético Madrid 90 93
18 M. ter Stegen GK Germany FC Barcelona 89 92
19 T. Courtois GK Belgium Real Madrid 89 90
22 M. Neuer GK Germany FC Bayern München 89 89
37 H. Lloris GK France Tottenham Hotspur 88 88
40 S. Handanovič GK Slovenia Inter 88 88
41 G. Buffon GK Italy Paris Saint-Germain 88 88
46 K. Navas GK Costa Rica Real Madrid 87 87
57 Ederson GK Brazil Manchester City 86 90
81 Alisson GK Brazil Liverpool 85 90
92 W. Szczęsny GK Poland Juventus 85 87
133 L. Hrádecký GK Finland Bayer 04 Leverkusen 84 84
147 S. Ruffier GK France AS Saint-Étienne 84 84
141 Sergio Asenjo GK Spain Villarreal CF 84 85
149 K. Schmeichel GK Denmark Leicester City 84 84
131 B. Leno GK Germany Arsenal 84 87
126 A. Lopes GK Portugal Olympique Lyonnais 84 86
128 M. Perin GK Italy Juventus 84 89
210 R. Fährmann GK Germany FC Schalke 04 83 84
In [140]:
#Let's check how many players are here only from England
eng_players_df=df[df.Nationality=='England']
eng_players_df
Out[140]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage
16 16 202126 H. Kane 24 https://cdn.sofifa.org/players/4/19/202126.png England https://cdn.sofifa.org/flags/14.png 89 91 Tottenham Hotspur ... 56.0 36.0 38.0 8.0 10.0 11.0 14.0 11.0 €160.7 205000.0
60 60 202652 R. Sterling 23 https://cdn.sofifa.org/players/4/19/202652.png England https://cdn.sofifa.org/flags/14.png 86 89 Manchester City ... 47.0 58.0 54.0 15.0 12.0 12.0 15.0 9.0 €108.8 195000.0
117 117 211117 D. Alli 22 https://cdn.sofifa.org/players/4/19/211117.png England https://cdn.sofifa.org/flags/14.png 84 90 Tottenham Hotspur ... 70.0 70.0 63.0 7.0 6.0 9.0 11.0 8.0 €87.1 115000.0
135 135 188377 K. Walker 28 https://cdn.sofifa.org/players/4/19/188377.png England https://cdn.sofifa.org/flags/14.png 84 84 Manchester City ... 78.0 84.0 83.0 12.0 6.0 16.0 15.0 8.0 €45.3 165000.0
180 180 204935 J. Pickford 24 https://cdn.sofifa.org/players/4/19/204935.png England https://cdn.sofifa.org/flags/14.png 83 88 Everton ... 16.0 20.0 12.0 83.0 78.0 88.0 81.0 86.0 €49.4 78000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18198 18198 242844 J. Livesey 18 https://cdn.sofifa.org/players/4/19/242844.png England https://cdn.sofifa.org/flags/14.png 47 70 Burton Albion ... 15.0 11.0 13.0 46.0 52.0 58.0 42.0 48.0 €165K 1000.0
18202 18202 238813 J. Lundstram 19 https://cdn.sofifa.org/players/4/19/238813.png England https://cdn.sofifa.org/flags/14.png 47 65 Crewe Alexandra ... 40.0 48.0 47.0 10.0 13.0 7.0 8.0 9.0 €143K 1000.0
18204 18204 241638 B. Worman 16 https://cdn.sofifa.org/players/4/19/241638.png England https://cdn.sofifa.org/flags/14.png 47 67 Cambridge United ... 32.0 13.0 11.0 6.0 5.0 10.0 6.0 13.0 €165K 1000.0
18205 18205 246268 D. Walker-Rice 17 https://cdn.sofifa.org/players/4/19/246268.png England https://cdn.sofifa.org/flags/14.png 47 66 Tranmere Rovers ... 20.0 25.0 27.0 14.0 6.0 14.0 8.0 9.0 €143K 1000.0
18206 18206 246269 G. Nugent 16 https://cdn.sofifa.org/players/4/19/246269.png England https://cdn.sofifa.org/flags/14.png 46 66 Tranmere Rovers ... 40.0 43.0 50.0 10.0 15.0 9.0 12.0 9.0 €165K 1000.0

1662 rows × 90 columns

In [141]:
print('Total number of players from England : {}'.format(eng_players_df.shape[0]))
Total number of players from England : 1662
In [143]:
clubs_df=df[['Club',]].drop_duplicates(subset='Club')
clubs_df
Out[143]:
Club
0 FC Barcelona
1 Juventus
2 Paris Saint-Germain
3 Manchester United
4 Manchester City
... ...
12006 Newport County
13054 Sligo Rovers
13369 Derry City
13683 Limerick FC
15950 Bray Wanderers

652 rows × 1 columns

In [144]:
print('Toal Number of clubs : {}'.format(clubs_df.shape[0]))
Toal Number of clubs : 652
In [145]:
#Top 10 positions of maxmium players
df.Position.value_counts().head(10)
Out[145]:
ST     2212
GK     2025
CB     1778
CM     1394
LB     1322
RB     1291
RM     1124
LM     1095
CAM     958
CDM     948
Name: Position, dtype: int64
In [146]:
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline 
import seaborn as sns
sns.set_style("darkgrid")
#matplotlib.rcParams['font.size'] = 14
#matplotlib.rcParams['figure.figsize'] = (20, 10)
#matplotlib.rcParams['figure.facecolor'] = '#00000000'
In [148]:
#Clustering Players by Overall and Potential Rating
player_highest_overall_df=df[['Name','Overall','Potential','Age','Nationality','Club','Preferred Foot']].copy()
player_highest_overall_df=player_highest_overall_df.sort_values('Overall',ascending=False)
player_highest_overall_df
Out[148]:
Name Overall Potential Age Nationality Club Preferred Foot
0 L. Messi 94 94 31 Argentina FC Barcelona Left
1 Cristiano Ronaldo 94 94 33 Portugal Juventus Right
2 Neymar Jr 92 93 26 Brazil Paris Saint-Germain Right
3 De Gea 91 93 27 Spain Manchester United Right
4 K. De Bruyne 91 92 27 Belgium Manchester City Right
... ... ... ... ... ... ... ...
18190 L. Watkins 47 67 18 England Cambridge United Right
18189 A. Kaltner 47 61 18 Germany SpVgg Unterhaching Right
18187 C. Ehlich 47 59 19 Germany SpVgg Unterhaching Right
18186 Zhang Yufeng 47 64 20 China PR Beijing Renhe FC Right
18206 G. Nugent 46 66 16 England Tranmere Rovers Right

18207 rows × 7 columns

In [149]:
#Top 10 players with highest overall rating
top_10_overall_df=player_highest_overall_df.head(10)
top_10_overall_df
Out[149]:
Name Overall Potential Age Nationality Club Preferred Foot
0 L. Messi 94 94 31 Argentina FC Barcelona Left
1 Cristiano Ronaldo 94 94 33 Portugal Juventus Right
2 Neymar Jr 92 93 26 Brazil Paris Saint-Germain Right
3 De Gea 91 93 27 Spain Manchester United Right
4 K. De Bruyne 91 92 27 Belgium Manchester City Right
5 E. Hazard 91 91 27 Belgium Chelsea Right
6 L. Modrić 91 91 32 Croatia Real Madrid Right
7 L. Suárez 91 91 31 Uruguay FC Barcelona Right
8 Sergio Ramos 91 91 32 Spain Real Madrid Right
12 D. Godín 90 90 32 Uruguay Atlético Madrid Right
In [152]:
player_highest_potential_df=df[['Name','Potential','Overall','Age','Nationality','Club','Preferred Foot']].copy()
player_highest_potential_df=player_highest_potential_df.sort_values('Potential',ascending=False)
player_highest_potential_df
Out[152]:
Name Potential Overall Age Nationality Club Preferred Foot
25 K. Mbappé 95 88 19 France Paris Saint-Germain Right
0 L. Messi 94 94 31 Argentina FC Barcelona Left
15 P. Dybala 94 89 24 Argentina Juventus Left
1 Cristiano Ronaldo 94 94 33 Portugal Juventus Right
2 Neymar Jr 93 92 26 Brazil Paris Saint-Germain Right
... ... ... ... ... ... ... ...
17988 Wang Xuanhong 51 51 28 China PR Beijing Renhe FC Right
18043 A. Suzuki 50 50 31 Japan Yokohama F. Marinos Right
18025 J. Miszczuk 50 50 27 Poland Jagiellonia Białystok Right
18183 K. Pilkington 48 48 44 England Cambridge United Right
18171 Y. Uchimura 48 48 33 Japan Hokkaido Consadole Sapporo Right

18207 rows × 7 columns

In [153]:
##Top 10 players with highest potential rating
top_10_potential_df=player_highest_potential_df.head(10)
top_10_potential_df
Out[153]:
Name Potential Overall Age Nationality Club Preferred Foot
25 K. Mbappé 95 88 19 France Paris Saint-Germain Right
0 L. Messi 94 94 31 Argentina FC Barcelona Left
15 P. Dybala 94 89 24 Argentina Juventus Left
1 Cristiano Ronaldo 94 94 33 Portugal Juventus Right
2 Neymar Jr 93 92 26 Brazil Paris Saint-Germain Right
3 De Gea 93 91 27 Spain Manchester United Right
9 J. Oblak 93 90 25 Slovenia Atlético Madrid Right
229 G. Donnarumma 93 82 19 Italy Milan Right
1143 Vinícius Júnior 92 77 17 Brazil Real Madrid Right
155 O. Dembélé 92 83 21 France FC Barcelona Left
In [155]:
#Top 10 players with potential rating + Top 10 players with overall rating
top_10_combined_df=pd.concat([top_10_potential_df,top_10_overall_df], axis=0)
top_10_combined_df=top_10_combined_df.drop_duplicates(subset='Name')
combined_df=top_10_combined_df.groupby('Name')[['Overall','Potential']].max()
combined_df=combined_df.sort_values('Overall',ascending=True)
combined_df
Out[155]:
Overall Potential
Name
Vinícius Júnior 77 92
G. Donnarumma 82 93
O. Dembélé 83 92
K. Mbappé 88 95
P. Dybala 89 94
D. Godín 90 90
J. Oblak 90 93
De Gea 91 93
E. Hazard 91 91
K. De Bruyne 91 92
L. Modrić 91 91
L. Suárez 91 91
Sergio Ramos 91 91
Neymar Jr 92 93
Cristiano Ronaldo 94 94
L. Messi 94 94
In [156]:
combined_df.plot(kind='barh',figsize=(15,10))
plt.title('Overall Rating vs Potential Rating ',fontsize=25)
plt.xlim(0,110,10)
plt.xlabel('Name')
plt.ylabel('Rating')
plt.legend();
In [159]:
#Players Overall and Potential Rating Distribution
plt.figure(figsize=(10,5))
plt.hist([df.Overall,df.Potential],color=['blue','black'],alpha=0.4,stacked=True)
plt.title('Players Overall and Potential Rating Distribution',fontsize=20)
plt.xlabel('Rating',fontsize=20)
plt.ylabel('Players Frequency',fontsize=20)
plt.legend(['overall','potential'],fontsize=20);
In [160]:
#Clustering players By Nationality
#Players number counts country wise
country_count_df=df['Nationality'].value_counts()
country_count_df
Out[160]:
England                 1662
Germany                 1198
Spain                   1072
Argentina                937
France                   914
                        ... 
New Caledonia              1
Fiji                       1
São Tomé & Príncipe        1
United Arab Emirates       1
Botswana                   1
Name: Nationality, Length: 164, dtype: int64
In [161]:
#Let's plot Top 20 countries with maximum number of players in FIFA 21
country_df=pd.DataFrame(country_count_df.head(20))
country_df.sort_values('Nationality',ascending=True).plot(kind='barh',figsize=(15,10),color='lightgreen')
plt.title('Top 20 Countries With Maximum Number of Players in FIFA 21',fontsize=25)
plt.ylabel('Countries',fontsize=20)
plt.xlabel('Number of Players',fontsize=20)
plt.legend(['Number of Players']);
In [163]:
#Clustering players by International Reputation
rep_df=df[['Name','International Reputation']].sort_values('International Reputation',ascending=False)
rep_df
Out[163]:
Name International Reputation
0 L. Messi 5.0
2 Neymar Jr 5.0
22 M. Neuer 5.0
109 Z. Ibrahimović 5.0
7 L. Suárez 5.0
... ... ...
7113 S. Nakatani 1.0
7114 M. Niemeyer 1.0
7115 Léo Silva 1.0
7116 R. Boateng 1.0
18206 G. Nugent 1.0

18207 rows × 2 columns

In [164]:
#Top 10 players with respect to their international reputation
rep_df.head(10)
Out[164]:
Name International Reputation
0 L. Messi 5.0
2 Neymar Jr 5.0
22 M. Neuer 5.0
109 Z. Ibrahimović 5.0
7 L. Suárez 5.0
1 Cristiano Ronaldo 5.0
93 A. Sánchez 4.0
54 Piqué 4.0
53 I. Rakitić 4.0
552 W. Rooney 4.0
In [165]:
#Representation of The Distribution of International Reputation of Players in FIFA 21
#Plotting a pie chart to represent the distribution of international reputation of players in FIFA 21
labels=['1','2','3','4','5']
sizes=df['International Reputation'].value_counts()
colors=['green','blue','yellow','red','black']
explode=[0.1,0.1,0.2,0.5,0.9]
plt.rcParams['figure.figsize']=(10,10)
plt.pie(sizes,labels=labels,colors=colors,explode=explode,shadow=True)
plt.title('International Reputation of The Players in FIFA 21',fontsize=20)
plt.legend()
plt.show()
In [167]:
v=df['International Reputation'].value_counts()
print('Out of all players, {:.2f}% players have international reputation of 1 in FIFA 21'.format(v.iloc[0]/df.shape[0]*100))
Out of all players, 91.06% players have international reputation of 1 in FIFA 21
In [169]:
#Each Different Playing Positions Distribution in FIFA 21
#printing total different positions
c=0
for i in df.Position.unique():
    if ',' not in i:
        print(i)
        c+=1
print('\n\nTotal {} different playing positions.'.format(c))
RF
ST
LW
GK
RCM
LF
RS
RCB
LCM
CB
LDM
CAM
CDM
LS
LCB
RM
LAM
LM
LB
RDM
RW
CM
RB
RAM
CF
RWB
LWB


Total 27 different playing positions.
In [170]:
#Taking each different position players in each seperate dataframes
plis=[df[df.Position.str[-2:]=='GK'].value_counts().sum(),
df[df.Position.str[-2:]=='ST'].value_counts().sum(),
df[df.Position.str[-2:]=='CB'].value_counts().sum(),
df[df.Position.str[-2:]=='LW'].value_counts().sum(),
df[df.Position.str[-2:]=='RW'].value_counts().sum(),
df[df.Position.str[-3:]=='CDM'].value_counts().sum(),
df[df.Position.str[-2:]=='CM'].value_counts().sum(),
df[df.Position.str[-2:]=='RB'].value_counts().sum(),
df[df.Position.str[-2:]=='LB'].value_counts().sum(),
df[df.Position.str[-2:]=='CF'].value_counts().sum(),
df[df.Position.str[-2:]=='LM'].value_counts().sum(),
df[df.Position.str[-2:]=='RM'].value_counts().sum(),
df[df.Position.str[-3:]=='LWB'].value_counts().sum(),
df[df.Position.str[-3:]=='CAM'].value_counts().sum(),
df[df.Position.str[-3:]=='RWB'].value_counts().sum()]
In [174]:
sns.set_theme()
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (15, 8)
In [175]:
from IPython.display import display
with pd.option_context('display.max_columns',None):
    display(df.describe())
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number Crossing Finishing HeadingAccuracy ShortPassing Volleys Dribbling Curve FKAccuracy LongPassing BallControl Acceleration SprintSpeed Agility Reactions Balance ShotPower Jumping Stamina Strength LongShots Aggression Interceptions Positioning Vision Penalties Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Normalized_Wage
count 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000
mean 9103.000000 214298.338606 25.122206 66.238699 71.307299 1597.809908 1.112924 2.947438 2.360356 19.508046 49.734181 45.550911 52.298144 58.686712 42.909026 55.371001 47.170824 42.750151 52.711933 58.369459 64.443730 64.556324 63.336189 61.673587 63.797935 55.313835 64.917834 63.053276 65.139781 46.985775 55.721700 46.575163 49.826770 53.260120 48.420607 58.493656 47.156973 47.572088 45.541056 16.572417 16.348382 16.189268 16.345691 16.666831 9731.312133
std 5256.052511 29965.244204 4.669943 6.908930 6.136496 272.586016 0.393554 0.659591 0.755394 15.935210 18.340299 19.500064 17.356983 14.680105 17.671067 18.885426 18.370998 17.593545 15.307651 16.664584 15.271849 15.002398 15.101839 9.540921 14.493574 17.448546 12.267253 16.201430 12.979956 19.386223 17.580066 20.807859 19.670807 14.391148 15.879699 11.810375 20.025458 21.773243 21.389596 17.692536 16.905507 16.502101 17.032944 17.951898 21999.290406
min 0.000000 16.000000 16.000000 46.000000 48.000000 731.000000 1.000000 1.000000 1.000000 1.000000 5.000000 2.000000 4.000000 7.000000 4.000000 4.000000 6.000000 0.000000 9.000000 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4551.500000 200315.500000 21.000000 62.000000 67.000000 1457.000000 1.000000 3.000000 2.000000 8.000000 38.000000 30.000000 45.000000 54.000000 30.000000 49.000000 34.000000 31.000000 43.000000 54.000000 57.000000 57.000000 55.000000 56.000000 56.000000 45.000000 58.000000 56.000000 58.000000 32.000000 44.000000 26.000000 38.000000 44.000000 39.000000 51.000000 30.000000 26.000000 24.000000 8.000000 8.000000 8.000000 8.000000 8.000000 1000.000000
50% 9103.000000 221759.000000 25.000000 66.000000 71.000000 1635.000000 1.000000 3.000000 2.000000 17.000000 54.000000 49.000000 56.000000 62.000000 44.000000 61.000000 48.000000 41.000000 56.000000 63.000000 67.000000 67.000000 66.000000 62.000000 66.000000 59.000000 66.000000 66.000000 66.000000 51.000000 59.000000 52.000000 55.000000 55.000000 49.000000 59.000000 53.000000 55.000000 52.000000 11.000000 11.000000 11.000000 11.000000 11.000000 3000.000000
75% 13654.500000 236529.500000 28.000000 71.000000 75.000000 1787.000000 1.000000 3.000000 3.000000 26.000000 64.000000 62.000000 64.000000 68.000000 57.000000 68.000000 62.000000 56.000000 64.000000 69.000000 75.000000 75.000000 74.000000 68.000000 74.000000 68.000000 73.000000 74.000000 74.000000 62.000000 69.000000 64.000000 64.000000 64.000000 60.000000 67.000000 64.000000 66.000000 64.000000 14.000000 14.000000 14.000000 14.000000 14.000000 9000.000000
max 18206.000000 246620.000000 45.000000 94.000000 95.000000 2346.000000 5.000000 5.000000 5.000000 99.000000 93.000000 95.000000 94.000000 93.000000 90.000000 97.000000 94.000000 94.000000 93.000000 96.000000 97.000000 96.000000 96.000000 96.000000 96.000000 95.000000 95.000000 96.000000 97.000000 94.000000 95.000000 92.000000 95.000000 94.000000 92.000000 96.000000 94.000000 93.000000 91.000000 90.000000 92.000000 91.000000 90.000000 94.000000 565000.000000
In [179]:
#Filling the missing value for the continuous variables for proper data visualization
df['ShortPassing'].fillna(df['ShortPassing'].mean(), inplace = True)
df['Volleys'].fillna(df['Volleys'].mean(), inplace = True)
df['Dribbling'].fillna(df['Dribbling'].mean(), inplace = True)
df['Curve'].fillna(df['Curve'].mean(), inplace = True)
df['FKAccuracy'].fillna(df['FKAccuracy'], inplace = True)
df['LongPassing'].fillna(df['LongPassing'].mean(), inplace = True)
df['BallControl'].fillna(df['BallControl'].mean(), inplace = True)
df['HeadingAccuracy'].fillna(df['HeadingAccuracy'].mean(), inplace = True)
df['Finishing'].fillna(df['Finishing'].mean(), inplace = True)
df['Crossing'].fillna(df['Crossing'].mean(), inplace = True)
df['Weight'].fillna('200lbs', inplace = True)
df['Contract Valid Until'].fillna(2019, inplace = True)
df['Height'].fillna("5'11", inplace = True)
df['Loaned From'].fillna('None', inplace = True)
df['Joined'].fillna('Jul 1, 2018', inplace = True)
df['Jersey Number'].fillna(8, inplace = True)
df['Body Type'].fillna('Normal', inplace = True)
df['Position'].fillna('ST', inplace = True)
df['Club'].fillna('No Club', inplace = True)
df['Work Rate'].fillna('Medium/ Medium', inplace = True)
df['Skill Moves'].fillna(df['Skill Moves'].median(), inplace = True)
df['Weak Foot'].fillna(3, inplace = True)
df['Preferred Foot'].fillna('Right', inplace = True)
df['International Reputation'].fillna(1, inplace = True)
df['Wage'].fillna('€200K', inplace = True)
df.fillna(0, inplace = True)
In [180]:
def defending(df):
    return int(round((df[['Marking', 'StandingTackle', 
                               'SlidingTackle']].mean()).mean()))

def general(df):
    return int(round((df[['HeadingAccuracy', 'Dribbling', 'Curve', 
                               'BallControl']].mean()).mean()))

def mental(df):
    return int(round((df[['Aggression', 'Interceptions', 'Positioning', 
                               'Vision','Composure']].mean()).mean()))

def passing(df):
    return int(round((df[['Crossing', 'ShortPassing', 
                               'LongPassing']].mean()).mean()))

def mobility(df):
    return int(round((df[['Acceleration', 'SprintSpeed', 
                               'Agility','Reactions']].mean()).mean()))
def power(df):
    return int(round((df[['Balance', 'Jumping', 'Stamina', 
                               'Strength']].mean()).mean()))

def rating(df):
    return int(round((df[['Potential', 'Overall']].mean()).mean()))

def shooting(df):
    return int(round((df[['Finishing', 'Volleys', 'FKAccuracy', 
                               'ShotPower','LongShots', 'Penalties']].mean()).mean()))
In [186]:
#Defining a function for cleaning the Weight data
def extract_value_from(value):
  out = value.replace('lbs', '')
  return float(out)

# applying the function to weight column
#data['value'] = data['value'].apply(lambda x: extract_value_from(x))
df['Weight'] = df['Weight'].apply(lambda x : extract_value_from(x))

df['Weight'].head()
Out[186]:
0    159.0
1    183.0
2    150.0
3    168.0
4    154.0
Name: Weight, dtype: float64
In [187]:
#Defining a function for cleaning the wage column
def extract_value_from(Value):
    out = Value.replace('€', '')
    if 'M' in out:
        out = float(out.replace('M', ''))*1000000
    elif 'K' in Value:
        out = float(out.replace('K', ''))*1000
    return float(out)
In [190]:
#Skill Moves of Players
plt.figure(figsize = (10, 8))
ax = sns.countplot(x = 'Skill Moves', data = df, palette = 'pastel')
ax.set_title(label = 'Count of players on Basis of their skill moves', fontsize = 20)
ax.set_xlabel(xlabel = 'Number of Skill Moves', fontsize = 16)
ax.set_ylabel(ylabel = 'Count', fontsize = 16)
plt.show()
In [191]:
#Height of Players
plt.figure(figsize = (13, 8))
ax = sns.countplot(x = 'Height', data = df, palette = 'dark')
ax.set_title(label = 'Count of players on Basis of Height', fontsize = 20)
ax.set_xlabel(xlabel = 'Height in Foot per inch', fontsize = 16)
ax.set_ylabel(ylabel = 'Count', fontsize = 16)
plt.show()
In [192]:
# Checking for records that contain missing values
null_val = df[df.isnull().any(axis = 1)]
null_val
Out[192]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause Normalized_Wage

0 rows × 90 columns

In [193]:
# Overall rating distribution 
plt.figure(figsize=(15,8))
sns.distplot(df['Overall'], bins=15, color='r') 
plt.title('Overall Rating Distribution in FIFA 23', fontsize = 16)
plt.show()
In [196]:
#Correlation Heatmap
#Next up, we want to see the correlation among the 
#relevant player attributes (Overall Rating, Potential Rating, Value, Age, Height, Weight, Wage, Release Clause). 
#Correlation matrices give such valuable insights as they show which attribute is influenced by the other. We will first of all,
#compute the correlation matrix of these relevant attributes.
# compute correlation matrix of the relevant columns 
relevant_col = df[['Overall', 'Potential', 'Age']].corr()
corr = relevant_col.corr()
corr
Out[196]:
Overall Potential Age
Overall 1.000000 0.601936 -0.309101
Potential 0.601936 1.000000 -0.945498
Age -0.309101 -0.945498 1.000000
In [197]:
# Display correlation heatmap
plt.figure(figsize=(15,10))
mask= np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=mask, cmap= 'vlag', center=0)
Out[197]:
<Axes: >
In [198]:
#Age vs Overall Rating
#Now we’re going to create a scatter plot showing 
#the relationship between Age and Overall Rating.
# Age vs Overall rating
plt.figure(figsize=(15,8))
sns.scatterplot(x = 'Age', y = 'Overall', data = df)
plt.title('Age vs Overall Rating', fontsize = 15)
plt.xlabel('Age', fontsize = 15)
plt.ylabel('Overall Rating', fontsize = 15)
Out[198]:
Text(0, 0.5, 'Overall Rating')
In [199]:
# The Top 10 ranked by Overall rating 
top_ten_overall = df[['Name','Overall','Age']].nlargest(10, 'Overall')

top_ten_overall
Out[199]:
Name Overall Age
0 L. Messi 94 31
1 Cristiano Ronaldo 94 33
2 Neymar Jr 92 26
3 De Gea 91 27
4 K. De Bruyne 91 27
5 E. Hazard 91 27
6 L. Modrić 91 32
7 L. Suárez 91 31
8 Sergio Ramos 91 32
9 J. Oblak 90 25
In [202]:
#Count of Players on Basis of Age
df['Age'].value_counts()
Out[202]:
21    1423
26    1387
24    1358
22    1340
23    1332
25    1319
20    1240
27    1162
28    1101
19    1024
29     959
30     917
18     732
31     707
32     574
33     408
34     404
17     289
35     196
36     127
37      82
16      42
38      37
39      25
40      13
41       5
44       2
45       1
42       1
Name: Age, dtype: int64
In [204]:
plt.figure(figsize =(15,7))
sns.countplot(x = 'Age', data = df)
plt.title('Count of Players on the Basis of Age (FIFA 23)', fontsize = 16)
plt.xlabel('Age', fontsize = 16)
plt.ylabel('Count', fontsize = 16)
Out[204]:
Text(0, 0.5, 'Count')
In [207]:
library("DALEX")
df_gbm_exp_deep <- DALEX::explain(df_gbm_deep, 
        df = fifa_small, y = 10^fifa_small$LogValue, 
        predict_function = function(m,x) 10^predict(m, x, n.trees = 250),
        label = "GBM deep")
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import numpy as np

X = df.drop(["Nationality", "Overall", "Potential", 
     "Value", "Wage"], axis = 1)
y = df['Value']
ylog = np.log(y)

X_train, X_test, ylog_train, ylog_test, y_train, y_test =
     train_test_split(X, ylog, y, test_size = 0.25, random_state = 4)
gbm_model = LGBMRegressor()
gbm_model.fit(X_train, ylog_train, verbose = False)
  Cell In[207], line 2
    df_gbm_exp_deep <- DALEX::explain(df_gbm_deep,
                            ^
SyntaxError: invalid syntax
In [208]:
df.describe().apply(lambda s: s.apply(lambda x: format(x,
'f')))
Out[208]:
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Normalized_Wage
count 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 ... 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000
mean 9103.000000 214298.338606 25.122206 66.238699 71.307299 1597.809908 1.112924 2.947438 2.360356 19.508046 ... 58.493656 47.156973 47.572088 45.541056 16.572417 16.348382 16.189268 16.345691 16.666831 9731.312133
std 5256.052511 29965.244204 4.669943 6.908930 6.136496 272.586016 0.393554 0.659591 0.755394 15.935210 ... 11.810375 20.025458 21.773243 21.389596 17.692536 16.905507 16.502101 17.032944 17.951898 21999.290406
min 0.000000 16.000000 16.000000 46.000000 48.000000 731.000000 1.000000 1.000000 1.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4551.500000 200315.500000 21.000000 62.000000 67.000000 1457.000000 1.000000 3.000000 2.000000 8.000000 ... 51.000000 30.000000 26.000000 24.000000 8.000000 8.000000 8.000000 8.000000 8.000000 1000.000000
50% 9103.000000 221759.000000 25.000000 66.000000 71.000000 1635.000000 1.000000 3.000000 2.000000 17.000000 ... 59.000000 53.000000 55.000000 52.000000 11.000000 11.000000 11.000000 11.000000 11.000000 3000.000000
75% 13654.500000 236529.500000 28.000000 71.000000 75.000000 1787.000000 1.000000 3.000000 3.000000 26.000000 ... 67.000000 64.000000 66.000000 64.000000 14.000000 14.000000 14.000000 14.000000 14.000000 9000.000000
max 18206.000000 246620.000000 45.000000 94.000000 95.000000 2346.000000 5.000000 5.000000 5.000000 99.000000 ... 96.000000 94.000000 93.000000 91.000000 90.000000 92.000000 91.000000 90.000000 94.000000 565000.000000

8 rows × 46 columns

In [211]:
player_name = df[["Acceleration","Name","Position","Age" , "Nationality","SprintSpeed"]].nlargest(7, ["Acceleration"]).set_index("Name")
player_name
Out[211]:
Acceleration Position Age Nationality SprintSpeed
Name
Douglas Costa 97.0 LM 27 Brazil 93.0
Adama 97.0 RW 22 Spain 96.0
K. Mbappé 96.0 RM 19 France 96.0
K. Manneh 96.0 LM 23 United States 93.0
S. Mané 95.0 LM 26 Senegal 93.0
R. Sterling 95.0 RW 23 England 92.0
K. Coman 95.0 LM 22 France 93.0
In [213]:
#Players Overall and Potential Rating vs Age in FIFA 21
fig, axes = plt.subplots(1, 2, sharex=True, figsize=(18,10))
fig.suptitle('Overall and Potential Rating vs Age',fontsize=20)
axes[0].set_title('Overall vs Age')
axes[1].set_title('Potential vs Age')
sns.scatterplot(ax=axes[0],x=df.Age,y=df.Overall);
sns.scatterplot(x=df.Age,y=df.Potential);
In [233]:
# Removing spaces in the column names to enable easy column reference  
df.columns = df.columns.str.replace(' ','')
In [236]:
df.head(5)
Out[236]:
Unnamed:0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes ReleaseClause Normalized_Wage
0 0 158023 L. Messi 31 https://cdn.sofifa.org/players/4/19/158023.png Argentina https://cdn.sofifa.org/flags/52.png 94 94 FC Barcelona ... 33.0 28.0 26.0 6.0 11.0 15.0 14.0 8.0 NaN 565000.0
1 1 20801 Cristiano Ronaldo 33 https://cdn.sofifa.org/players/4/19/20801.png Portugal https://cdn.sofifa.org/flags/38.png 94 94 Juventus ... 28.0 31.0 23.0 7.0 11.0 15.0 14.0 11.0 NaN 405000.0
2 2 190871 Neymar Jr 26 https://cdn.sofifa.org/players/4/19/190871.png Brazil https://cdn.sofifa.org/flags/54.png 92 93 Paris Saint-Germain ... 27.0 24.0 33.0 9.0 9.0 15.0 15.0 11.0 8.1 290000.0
3 3 193080 De Gea 27 https://cdn.sofifa.org/players/4/19/193080.png Spain https://cdn.sofifa.org/flags/45.png 91 93 Manchester United ... 15.0 21.0 13.0 90.0 85.0 87.0 88.0 94.0 NaN 260000.0
4 4 192985 K. De Bruyne 27 https://cdn.sofifa.org/players/4/19/192985.png Belgium https://cdn.sofifa.org/flags/7.png 91 92 Manchester City ... 68.0 58.0 51.0 15.0 13.0 5.0 10.0 13.0 NaN 355000.0

5 rows × 90 columns

In [237]:
df.tail()
Out[237]:
Unnamed:0 ID Name Age Photo Nationality Flag Overall Potential Club ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes ReleaseClause Normalized_Wage
18202 18202 238813 J. Lundstram 19 https://cdn.sofifa.org/players/4/19/238813.png England https://cdn.sofifa.org/flags/14.png 47 65 Crewe Alexandra ... 40.0 48.0 47.0 10.0 13.0 7.0 8.0 9.0 NaN 1000.0
18203 18203 243165 N. Christoffersson 19 https://cdn.sofifa.org/players/4/19/243165.png Sweden https://cdn.sofifa.org/flags/46.png 47 63 Trelleborgs FF ... 22.0 15.0 19.0 10.0 9.0 9.0 5.0 12.0 NaN 1000.0
18204 18204 241638 B. Worman 16 https://cdn.sofifa.org/players/4/19/241638.png England https://cdn.sofifa.org/flags/14.png 47 67 Cambridge United ... 32.0 13.0 11.0 6.0 5.0 10.0 6.0 13.0 NaN 1000.0
18205 18205 246268 D. Walker-Rice 17 https://cdn.sofifa.org/players/4/19/246268.png England https://cdn.sofifa.org/flags/14.png 47 66 Tranmere Rovers ... 20.0 25.0 27.0 14.0 6.0 14.0 8.0 9.0 NaN 1000.0
18206 18206 246269 G. Nugent 16 https://cdn.sofifa.org/players/4/19/246269.png England https://cdn.sofifa.org/flags/14.png 46 66 Tranmere Rovers ... 40.0 43.0 50.0 10.0 15.0 9.0 12.0 9.0 NaN 1000.0

5 rows × 90 columns

In [238]:
df.info
Out[238]:
<bound method DataFrame.info of        Unnamed:0      ID                Name  Age  \
0              0  158023            L. Messi   31   
1              1   20801   Cristiano Ronaldo   33   
2              2  190871           Neymar Jr   26   
3              3  193080              De Gea   27   
4              4  192985        K. De Bruyne   27   
...          ...     ...                 ...  ...   
18202      18202  238813        J. Lundstram   19   
18203      18203  243165  N. Christoffersson   19   
18204      18204  241638           B. Worman   16   
18205      18205  246268      D. Walker-Rice   17   
18206      18206  246269           G. Nugent   16   

                                                Photo Nationality  \
0      https://cdn.sofifa.org/players/4/19/158023.png   Argentina   
1       https://cdn.sofifa.org/players/4/19/20801.png    Portugal   
2      https://cdn.sofifa.org/players/4/19/190871.png      Brazil   
3      https://cdn.sofifa.org/players/4/19/193080.png       Spain   
4      https://cdn.sofifa.org/players/4/19/192985.png     Belgium   
...                                               ...         ...   
18202  https://cdn.sofifa.org/players/4/19/238813.png     England   
18203  https://cdn.sofifa.org/players/4/19/243165.png      Sweden   
18204  https://cdn.sofifa.org/players/4/19/241638.png     England   
18205  https://cdn.sofifa.org/players/4/19/246268.png     England   
18206  https://cdn.sofifa.org/players/4/19/246269.png     England   

                                      Flag  Overall  Potential  \
0      https://cdn.sofifa.org/flags/52.png       94         94   
1      https://cdn.sofifa.org/flags/38.png       94         94   
2      https://cdn.sofifa.org/flags/54.png       92         93   
3      https://cdn.sofifa.org/flags/45.png       91         93   
4       https://cdn.sofifa.org/flags/7.png       91         92   
...                                    ...      ...        ...   
18202  https://cdn.sofifa.org/flags/14.png       47         65   
18203  https://cdn.sofifa.org/flags/46.png       47         63   
18204  https://cdn.sofifa.org/flags/14.png       47         67   
18205  https://cdn.sofifa.org/flags/14.png       47         66   
18206  https://cdn.sofifa.org/flags/14.png       46         66   

                      Club  ... Marking  StandingTackle  SlidingTackle  \
0             FC Barcelona  ...    33.0            28.0           26.0   
1                 Juventus  ...    28.0            31.0           23.0   
2      Paris Saint-Germain  ...    27.0            24.0           33.0   
3        Manchester United  ...    15.0            21.0           13.0   
4          Manchester City  ...    68.0            58.0           51.0   
...                    ...  ...     ...             ...            ...   
18202      Crewe Alexandra  ...    40.0            48.0           47.0   
18203       Trelleborgs FF  ...    22.0            15.0           19.0   
18204     Cambridge United  ...    32.0            13.0           11.0   
18205      Tranmere Rovers  ...    20.0            25.0           27.0   
18206      Tranmere Rovers  ...    40.0            43.0           50.0   

       GKDiving GKHandling  GKKicking  GKPositioning  GKReflexes  \
0           6.0       11.0       15.0           14.0         8.0   
1           7.0       11.0       15.0           14.0        11.0   
2           9.0        9.0       15.0           15.0        11.0   
3          90.0       85.0       87.0           88.0        94.0   
4          15.0       13.0        5.0           10.0        13.0   
...         ...        ...        ...            ...         ...   
18202      10.0       13.0        7.0            8.0         9.0   
18203      10.0        9.0        9.0            5.0        12.0   
18204       6.0        5.0       10.0            6.0        13.0   
18205      14.0        6.0       14.0            8.0         9.0   
18206      10.0       15.0        9.0           12.0         9.0   

      ReleaseClause Normalized_Wage  
0               NaN        565000.0  
1               NaN        405000.0  
2               8.1        290000.0  
3               NaN        260000.0  
4               NaN        355000.0  
...             ...             ...  
18202           NaN          1000.0  
18203           NaN          1000.0  
18204           NaN          1000.0  
18205           NaN          1000.0  
18206           NaN          1000.0  

[18207 rows x 90 columns]>
In [240]:
import warnings
import numpy as np
import pandas as pd
from pyod.models.mad import MAD
from pyod.models.knn import KNN
from pyod.models.lof import LOF
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
In [243]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
In [244]:
#Parametric methods: Univariate
df.hist(figsize=(6,6));
In [245]:
# enough variation between features to show outliers
df.describe()
Out[245]:
Unnamed:0 ID Age Overall Potential Value Wage Special InternationalReputation WeakFoot ... Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes ReleaseClause Normalized_Wage
count 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 0.0 0.0 18207.000000 18207.000000 18207.000000 ... 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 34.000000 18207.000000
mean 9103.000000 214298.338606 25.122206 66.238699 71.307299 NaN NaN 1597.809908 1.112924 2.947438 ... 47.156973 47.572088 45.541056 16.572417 16.348382 16.189268 16.345691 16.666831 0.752941 9731.312133
std 5256.052511 29965.244204 4.669943 6.908930 6.136496 NaN NaN 272.586016 0.393554 0.659591 ... 20.025458 21.773243 21.389596 17.692536 16.905507 16.502101 17.032944 17.951898 1.329670 21999.290406
min 0.000000 16.000000 16.000000 46.000000 48.000000 NaN NaN 731.000000 1.000000 1.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.100000 0.000000
25% 4551.500000 200315.500000 21.000000 62.000000 67.000000 NaN NaN 1457.000000 1.000000 3.000000 ... 30.000000 26.000000 24.000000 8.000000 8.000000 8.000000 8.000000 8.000000 0.200000 1000.000000
50% 9103.000000 221759.000000 25.000000 66.000000 71.000000 NaN NaN 1635.000000 1.000000 3.000000 ... 53.000000 55.000000 52.000000 11.000000 11.000000 11.000000 11.000000 11.000000 0.700000 3000.000000
75% 13654.500000 236529.500000 28.000000 71.000000 75.000000 NaN NaN 1787.000000 1.000000 3.000000 ... 64.000000 66.000000 64.000000 14.000000 14.000000 14.000000 14.000000 14.000000 0.800000 9000.000000
max 18206.000000 246620.000000 45.000000 94.000000 95.000000 NaN NaN 2346.000000 5.000000 5.000000 ... 94.000000 93.000000 91.000000 90.000000 92.000000 91.000000 90.000000 94.000000 8.100000 565000.000000

8 rows × 49 columns

In [246]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/prose/OneDrive/Desktop/Data/data.csv")
df.head()
Out[246]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
0 0 158023 L. Messi 31 https://cdn.sofifa.org/players/4/19/158023.png Argentina https://cdn.sofifa.org/flags/52.png 94 94 FC Barcelona ... 96.0 33.0 28.0 26.0 6.0 11.0 15.0 14.0 8.0 €226.5M
1 1 20801 Cristiano Ronaldo 33 https://cdn.sofifa.org/players/4/19/20801.png Portugal https://cdn.sofifa.org/flags/38.png 94 94 Juventus ... 95.0 28.0 31.0 23.0 7.0 11.0 15.0 14.0 11.0 €127.1M
2 2 190871 Neymar Jr 26 https://cdn.sofifa.org/players/4/19/190871.png Brazil https://cdn.sofifa.org/flags/54.png 92 93 Paris Saint-Germain ... 94.0 27.0 24.0 33.0 9.0 9.0 15.0 15.0 11.0 €228.1M
3 3 193080 De Gea 27 https://cdn.sofifa.org/players/4/19/193080.png Spain https://cdn.sofifa.org/flags/45.png 91 93 Manchester United ... 68.0 15.0 21.0 13.0 90.0 85.0 87.0 88.0 94.0 €138.6M
4 4 192985 K. De Bruyne 27 https://cdn.sofifa.org/players/4/19/192985.png Belgium https://cdn.sofifa.org/flags/7.png 91 92 Manchester City ... 88.0 68.0 58.0 51.0 15.0 13.0 5.0 10.0 13.0 €196.4M

5 rows × 89 columns

In [247]:
# enough variation between features to show outliers
df.describe()
Out[247]:
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number ... Penalties Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes
count 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18207.000000 18159.000000 18159.000000 18159.000000 18147.000000 ... 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000 18159.000000
mean 9103.000000 214298.338606 25.122206 66.238699 71.307299 1597.809908 1.113222 2.947299 2.361308 19.546096 ... 48.548598 58.648274 47.281623 47.697836 45.661435 16.616223 16.391596 16.232061 16.388898 16.710887
std 5256.052511 29965.244204 4.669943 6.908930 6.136496 272.586016 0.394031 0.660456 0.756164 15.947765 ... 15.704053 11.436133 19.904397 21.664004 21.289135 17.695349 16.906900 16.502864 17.034669 17.955119
min 0.000000 16.000000 16.000000 46.000000 48.000000 731.000000 1.000000 1.000000 1.000000 1.000000 ... 5.000000 3.000000 3.000000 2.000000 3.000000 1.000000 1.000000 1.000000 1.000000 1.000000
25% 4551.500000 200315.500000 21.000000 62.000000 67.000000 1457.000000 1.000000 3.000000 2.000000 8.000000 ... 39.000000 51.000000 30.000000 27.000000 24.000000 8.000000 8.000000 8.000000 8.000000 8.000000
50% 9103.000000 221759.000000 25.000000 66.000000 71.000000 1635.000000 1.000000 3.000000 2.000000 17.000000 ... 49.000000 60.000000 53.000000 55.000000 52.000000 11.000000 11.000000 11.000000 11.000000 11.000000
75% 13654.500000 236529.500000 28.000000 71.000000 75.000000 1787.000000 1.000000 3.000000 3.000000 26.000000 ... 60.000000 67.000000 64.000000 66.000000 64.000000 14.000000 14.000000 14.000000 14.000000 14.000000
max 18206.000000 246620.000000 45.000000 94.000000 95.000000 2346.000000 5.000000 5.000000 5.000000 99.000000 ... 92.000000 96.000000 94.000000 93.000000 91.000000 90.000000 92.000000 91.000000 90.000000 94.000000

8 rows × 44 columns

In [248]:
#Parametric methods: Univariate
df.hist(figsize=(6,6));
In [249]:
def out_std(s, nstd=3.0, return_thresholds=False):
    """
    Return a boolean mask of outliers for a series
    using standard deviation, works column-wise.
    param nstd:
        Set number of standard deviations from the mean
        to consider an outlier
    :type nstd: ``float``
    param return_thresholds:
        True returns the lower and upper bounds, good for plotting.
        False returns the masked array 
    :type return_thresholds: ``bool``
    """
    data_mean, data_std = s.mean(), s.std()
    cut_off = df_std * nstd
    lower, upper = df_mean - cut_off, df_mean + cut_off
    if return_thresholds:
        return lower, upper
    else:
        return [True if x < lower or x > upper else False for x in s]

def out_iqr(s, k=1.5, return_thresholds=False):
    """
    Return a boolean mask of outliers for a series
    using interquartile range, works column-wise.
    param k:
        some cutoff to multiply by the iqr
    :type k: ``float``
    param return_thresholds:
        True returns the lower and upper bounds, good for plotting.
        False returns the masked array 
    :type return_thresholds: ``bool``
    """
    # calculate interquartile range
    q25, q75 = np.percentile(s, 25), np.percentile(s, 75)
    iqr = q75 - q25
    # calculate the outlier cutoff
    cut_off = iqr * k
    lower, upper = q25 - cut_off, q75 + cut_off
    if return_thresholds:
        return lower, upper
    else: # identify outliers
        return [True if x < lower or x > upper else False for x in s]
In [250]:
df['Height'].describe()
Out[250]:
count     18159
unique       21
top         6'0
freq       2881
Name: Height, dtype: object
In [251]:
df['Weight'].describe()
Out[251]:
count      18159
unique        57
top       165lbs
freq        1483
Name: Weight, dtype: object
In [252]:
# Create a histogram using Seaborn
g = sns.histplot(data = df, x = 'Weight')

# Add labels
g.set_xlabel('Weight Distribution')
Out[252]:
Text(0.5, 0, 'Weight Distribution')
In [253]:
# Create a histogram using Seaborn
g = sns.histplot(data = df, x = 'Height')

# Add labels
g.set_xlabel('Height Distribution')
Out[253]:
Text(0.5, 0, 'Height Distribution')
In [256]:
df['Weight'] = df['Weight'].str.replace('lbs', '')
In [257]:
df['Value'] = df['Value'].str.replace('M', '')
In [259]:
df["Value"].head()
Out[259]:
0    €110.5
1       €77
2    €118.5
3       €72
4      €102
Name: Value, dtype: object
In [260]:
df["Weight"].head()
Out[260]:
0    159
1    183
2    150
3    168
4    154
Name: Weight, dtype: object
In [263]:
# Box Plot
import seaborn as sns
sns.boxplot(df['Weight'])
Out[263]:
<Axes: >
In [267]:
import pandas as pd

# Use Regular Expression to convert Height from feet + inches into cm. Convert from text into integer
r = re.compile(r"([0-9]+)'([0-9]+)")    # to set the pattern e.g. 5'7 , 4'12 etc
def get_cm(height):
    height = str(height)
    m = r.match(height)
    if m == None:
        return float('NaN')
    else:
        return float(m.group(1))*30.48 + float(m.group(2))*2.54
df["Height"] = df["Height"].apply(lambda x:get_cm(x))
In [268]:
# Box Plot
import seaborn as sns
sns.boxplot(df['Height'])
Out[268]:
<Axes: >
In [273]:
# Import zscore function
from scipy.stats import zscore

# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Height'])
abs_z_scores = np.abs(z_scores)

# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
Out[273]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause

0 rows × 89 columns

In [274]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
Out[274]:
nan
In [277]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Age'])
mad_score
Out[277]:
4.0
In [280]:
# Calculate the percentiles
seventy_fifth = df['Age'].quantile(0.75)
twenty_fifth = df['Age'].quantile(0.25)

# Obtain IQR
iqr = seventy_fifth - twenty_fifth

# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)

# Subset the dataset
outliers = df[(df['Age'] < lower) | (df['Age'] > upper)]
outliers.head()
Out[280]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
41 41 1179 G. Buffon 40 https://cdn.sofifa.org/players/4/19/1179.png Italy https://cdn.sofifa.org/flags/27.png 88 88 Paris Saint-Germain ... 70.0 13.0 11.0 11.0 88.0 87.0 74.0 90.0 83.0 €7.4M
554 554 49031 S. Sorrentino 39 https://cdn.sofifa.org/players/4/19/49031.png Italy https://cdn.sofifa.org/flags/27.png 80 80 Chievo Verona ... 66.0 25.0 10.0 13.0 81.0 82.0 66.0 82.0 79.0 €1.7M
864 864 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png Brazil https://cdn.sofifa.org/flags/54.png 78 78 Montpellier HSC ... 70.0 83.0 77.0 76.0 12.0 7.0 11.0 12.0 13.0 NaN
1120 1120 156092 J. Villar 41 https://cdn.sofifa.org/players/4/19/156092.png Paraguay https://cdn.sofifa.org/flags/58.png 77 77 NaN ... 55.0 13.0 13.0 14.0 75.0 75.0 74.0 78.0 77.0 NaN
1294 1294 14907 A. Bizzarri 40 https://cdn.sofifa.org/players/4/19/14907.png Argentina https://cdn.sofifa.org/flags/52.png 76 76 Foggia ... 60.0 11.0 12.0 11.0 76.0 74.0 66.0 82.0 76.0 €840K

5 rows × 89 columns

In [281]:
df['Weight'].head()
Out[281]:
0    159
1    183
2    150
3    168
4    154
Name: Weight, dtype: object
In [282]:
df['Age'].head()
Out[282]:
0    31
1    33
2    26
3    27
4    27
Name: Age, dtype: int64
In [283]:
# Convert the 'Weight' column to int64
df['Weight'] = pd.to_numeric(df['Weight'], errors='coerce').astype('Int64')

# Display the DataFrame with 'Weight' column updated to int64
print(df)
       Unnamed: 0      ID                Name  Age  \
0               0  158023            L. Messi   31   
1               1   20801   Cristiano Ronaldo   33   
2               2  190871           Neymar Jr   26   
3               3  193080              De Gea   27   
4               4  192985        K. De Bruyne   27   
...           ...     ...                 ...  ...   
18202       18202  238813        J. Lundstram   19   
18203       18203  243165  N. Christoffersson   19   
18204       18204  241638           B. Worman   16   
18205       18205  246268      D. Walker-Rice   17   
18206       18206  246269           G. Nugent   16   

                                                Photo Nationality  \
0      https://cdn.sofifa.org/players/4/19/158023.png   Argentina   
1       https://cdn.sofifa.org/players/4/19/20801.png    Portugal   
2      https://cdn.sofifa.org/players/4/19/190871.png      Brazil   
3      https://cdn.sofifa.org/players/4/19/193080.png       Spain   
4      https://cdn.sofifa.org/players/4/19/192985.png     Belgium   
...                                               ...         ...   
18202  https://cdn.sofifa.org/players/4/19/238813.png     England   
18203  https://cdn.sofifa.org/players/4/19/243165.png      Sweden   
18204  https://cdn.sofifa.org/players/4/19/241638.png     England   
18205  https://cdn.sofifa.org/players/4/19/246268.png     England   
18206  https://cdn.sofifa.org/players/4/19/246269.png     England   

                                      Flag  Overall  Potential  \
0      https://cdn.sofifa.org/flags/52.png       94         94   
1      https://cdn.sofifa.org/flags/38.png       94         94   
2      https://cdn.sofifa.org/flags/54.png       92         93   
3      https://cdn.sofifa.org/flags/45.png       91         93   
4       https://cdn.sofifa.org/flags/7.png       91         92   
...                                    ...      ...        ...   
18202  https://cdn.sofifa.org/flags/14.png       47         65   
18203  https://cdn.sofifa.org/flags/46.png       47         63   
18204  https://cdn.sofifa.org/flags/14.png       47         67   
18205  https://cdn.sofifa.org/flags/14.png       47         66   
18206  https://cdn.sofifa.org/flags/14.png       46         66   

                      Club  ... Composure Marking StandingTackle  \
0             FC Barcelona  ...      96.0    33.0           28.0   
1                 Juventus  ...      95.0    28.0           31.0   
2      Paris Saint-Germain  ...      94.0    27.0           24.0   
3        Manchester United  ...      68.0    15.0           21.0   
4          Manchester City  ...      88.0    68.0           58.0   
...                    ...  ...       ...     ...            ...   
18202      Crewe Alexandra  ...      45.0    40.0           48.0   
18203       Trelleborgs FF  ...      42.0    22.0           15.0   
18204     Cambridge United  ...      41.0    32.0           13.0   
18205      Tranmere Rovers  ...      46.0    20.0           25.0   
18206      Tranmere Rovers  ...      43.0    40.0           43.0   

       SlidingTackle GKDiving  GKHandling  GKKicking  GKPositioning  \
0               26.0      6.0        11.0       15.0           14.0   
1               23.0      7.0        11.0       15.0           14.0   
2               33.0      9.0         9.0       15.0           15.0   
3               13.0     90.0        85.0       87.0           88.0   
4               51.0     15.0        13.0        5.0           10.0   
...              ...      ...         ...        ...            ...   
18202           47.0     10.0        13.0        7.0            8.0   
18203           19.0     10.0         9.0        9.0            5.0   
18204           11.0      6.0         5.0       10.0            6.0   
18205           27.0     14.0         6.0       14.0            8.0   
18206           50.0     10.0        15.0        9.0           12.0   

      GKReflexes Release Clause  
0            8.0        €226.5M  
1           11.0        €127.1M  
2           11.0        €228.1M  
3           94.0        €138.6M  
4           13.0        €196.4M  
...          ...            ...  
18202        9.0          €143K  
18203       12.0          €113K  
18204       13.0          €165K  
18205        9.0          €143K  
18206        9.0          €165K  

[18207 rows x 89 columns]
In [284]:
# Calculate the percentiles
seventy_fifth = df['Weight'].quantile(0.75)
twenty_fifth = df['Weight'].quantile(0.25)

# Obtain IQR
iqr = seventy_fifth - twenty_fifth

# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)

# Subset the dataset
outliers = df[(df['Weight'] < lower) | (df['Weight'] > upper)]
outliers.head()
Out[284]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
19 19 192119 T. Courtois 26 https://cdn.sofifa.org/players/4/19/192119.png Belgium https://cdn.sofifa.org/flags/7.png 89 90 Real Madrid ... 66.0 20.0 18.0 16.0 85.0 91.0 72.0 86.0 88.0 €113.7M
115 115 212190 N. Süle 22 https://cdn.sofifa.org/players/4/19/212190.png Germany https://cdn.sofifa.org/flags/21.png 84 90 FC Bayern München ... 72.0 82.0 85.0 83.0 15.0 7.0 14.0 7.0 15.0 €67.5M
165 165 213331 J. Tah 22 https://cdn.sofifa.org/players/4/19/213331.png Germany https://cdn.sofifa.org/flags/21.png 83 88 Bayer 04 Leverkusen ... 75.0 80.0 88.0 84.0 11.0 8.0 7.0 9.0 14.0 €52.4M
210 210 179783 R. Fährmann 29 https://cdn.sofifa.org/players/4/19/179783.png Germany https://cdn.sofifa.org/flags/21.png 83 84 FC Schalke 04 ... 61.0 10.0 12.0 10.0 83.0 81.0 52.0 82.0 87.0 €35.5M
259 259 203263 H. Maguire 25 https://cdn.sofifa.org/players/4/19/203263.png England https://cdn.sofifa.org/flags/14.png 82 85 Leicester City ... 79.0 81.0 84.0 81.0 14.0 16.0 9.0 14.0 9.0 €46.4M

5 rows × 89 columns

In [286]:
# Remove rows with NA values in the 'Weight' column
df = df.dropna(subset=['Weight'])
In [287]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Weight'])
mad_score
Out[287]:
11.0
In [288]:
# Box Plot
import seaborn as sns
sns.boxplot(df['Weight'])
Out[288]:
<Axes: >
In [290]:
# Create a box plot
g = sns.boxplot(data = df, x = 'Weight')

# Add a title and change xlabel
g.set_title('Box Plot of Weight')
g.set_xlabel('Weight')
Out[290]:
Text(0.5, 0, 'Weight')
In [292]:
df['Height'].head()
Out[292]:
0    170.18
1    187.96
2    175.26
3    193.04
4    180.34
Name: Height, dtype: float64
In [293]:
# Calculate the percentiles
seventy_fifth = df['Height'].quantile(0.75)
twenty_fifth = df['Height'].quantile(0.25)

# Obtain IQR
iqr = seventy_fifth - twenty_fifth

# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)

# Subset the dataset
outliers = df[(df['Height'] < lower) | (df['Height'] > upper)]
outliers.head()
Out[293]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
463 463 203841 N. Pope 26 https://cdn.sofifa.org/players/4/19/203841.png England https://cdn.sofifa.org/flags/14.png 80 83 Burnley ... 62.0 14.0 15.0 14.0 79.0 80.0 74.0 80.0 80.0 €25.7M
1204 1204 220932 L. Kalinić 28 https://cdn.sofifa.org/players/4/19/220932.png Croatia https://cdn.sofifa.org/flags/10.png 76 78 KAA Gent ... 58.0 13.0 17.0 17.0 81.0 75.0 42.0 76.0 80.0 €10.2M
1340 1340 183895 M. Moralez 31 https://cdn.sofifa.org/players/4/19/183895.png Argentina https://cdn.sofifa.org/flags/52.png 76 76 New York City FC ... 76.0 48.0 58.0 51.0 6.0 5.0 14.0 14.0 9.0 €10.5M
1454 1454 172203 F. Forster 30 https://cdn.sofifa.org/players/4/19/172203.png England https://cdn.sofifa.org/flags/14.png 76 77 Southampton ... 56.0 11.0 13.0 12.0 72.0 76.0 64.0 77.0 78.0 €10.9M
2493 2493 202184 J. Plata 26 https://cdn.sofifa.org/players/4/19/202184.png Ecuador https://cdn.sofifa.org/flags/57.png 74 74 Real Salt Lake ... 70.0 34.0 29.0 27.0 11.0 15.0 9.0 13.0 13.0 €9M

5 rows × 89 columns

In [294]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
Out[294]:
5.079999999999984
In [295]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Weight'])
mad_score
Out[295]:
11.0
In [296]:
# Create a box plot
g = sns.boxplot(data = df, x = 'Height')

# Add a title and change xlabel
g.set_title('Box Plot of Height')
g.set_xlabel('Height')
Out[296]:
Text(0.5, 0, 'Height')
In [297]:
# Import zscore function
from scipy.stats import zscore

# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Height'])
abs_z_scores = np.abs(z_scores)

# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
Out[297]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
1132 1132 221634 L. Acosta 24 https://cdn.sofifa.org/players/4/19/221634.png Argentina https://cdn.sofifa.org/flags/52.png 77 79 DC United ... 74.0 45.0 47.0 31.0 13.0 9.0 6.0 12.0 10.0 €17.3M
1165 1165 170719 D. Buonanotte 30 https://cdn.sofifa.org/players/4/19/170719.png Argentina https://cdn.sofifa.org/flags/52.png 77 77 Universidad Católica ... 71.0 35.0 31.0 18.0 15.0 16.0 16.0 9.0 13.0 €11.9M
1340 1340 183895 M. Moralez 31 https://cdn.sofifa.org/players/4/19/183895.png Argentina https://cdn.sofifa.org/flags/52.png 76 76 New York City FC ... 76.0 48.0 58.0 51.0 6.0 5.0 14.0 14.0 9.0 €10.5M
1680 1680 214327 V. Hernández 29 https://cdn.sofifa.org/players/4/19/214327.png Colombia https://cdn.sofifa.org/flags/56.png 75 75 Atlético Nacional ... 77.0 62.0 20.0 21.0 10.0 15.0 15.0 13.0 11.0 €9.1M
2493 2493 202184 J. Plata 26 https://cdn.sofifa.org/players/4/19/202184.png Ecuador https://cdn.sofifa.org/flags/57.png 74 74 Real Salt Lake ... 70.0 34.0 29.0 27.0 11.0 15.0 9.0 13.0 13.0 €9M

5 rows × 89 columns

In [298]:
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 38
In [300]:
# Import zscore function
from scipy.stats import zscore

# Calculate z-score for each data point and compute its absolute value
z_scores = zscore(df['Age'])
abs_z_scores = np.abs(z_scores)

# Select the outliers using a threshold of 3
outliers = df[abs_z_scores > 3]
outliers.head()
Out[300]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
41 41 1179 G. Buffon 40 https://cdn.sofifa.org/players/4/19/1179.png Italy https://cdn.sofifa.org/flags/27.png 88 88 Paris Saint-Germain ... 70.0 13.0 11.0 11.0 88.0 87.0 74.0 90.0 83.0 €7.4M
864 864 153260 Hilton 40 https://cdn.sofifa.org/players/4/19/153260.png Brazil https://cdn.sofifa.org/flags/54.png 78 78 Montpellier HSC ... 70.0 83.0 77.0 76.0 12.0 7.0 11.0 12.0 13.0 NaN
1120 1120 156092 J. Villar 41 https://cdn.sofifa.org/players/4/19/156092.png Paraguay https://cdn.sofifa.org/flags/58.png 77 77 NaN ... 55.0 13.0 13.0 14.0 75.0 75.0 74.0 78.0 77.0 NaN
1294 1294 14907 A. Bizzarri 40 https://cdn.sofifa.org/players/4/19/14907.png Argentina https://cdn.sofifa.org/flags/52.png 76 76 Foggia ... 60.0 11.0 12.0 11.0 76.0 74.0 66.0 82.0 76.0 €840K
2821 2821 232543 S. Bertoli 40 https://cdn.sofifa.org/players/4/19/232543.png Argentina https://cdn.sofifa.org/flags/52.png 73 73 Patronato ... 44.0 12.0 13.0 11.0 76.0 73.0 78.0 67.0 71.0 €392K

5 rows × 89 columns

In [301]:
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 22
In [303]:
# Import the median_abs_deviation function
from scipy.stats import median_abs_deviation

# Obtain the MAD value
mad_score = median_abs_deviation(df['Height'])
mad_score
Out[303]:
5.079999999999984
In [305]:
# Calculate the percentiles
seventy_fifth = df['Height'].quantile(0.75)
twenty_fifth = df['Height'].quantile(0.25)

# Obtain IQR
iqr = seventy_fifth - twenty_fifth

# Upper and lower thresholds
upper = seventy_fifth + (1.5 * iqr)
lower = twenty_fifth - (1.5 * iqr)

# Subset the dataset
outliers = df[(df['Height'] < lower) | (df['Height'] > upper)]
outliers.head()
Out[305]:
Unnamed: 0 ID Name Age Photo Nationality Flag Overall Potential Club ... Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes Release Clause
463 463 203841 N. Pope 26 https://cdn.sofifa.org/players/4/19/203841.png England https://cdn.sofifa.org/flags/14.png 80 83 Burnley ... 62.0 14.0 15.0 14.0 79.0 80.0 74.0 80.0 80.0 €25.7M
1204 1204 220932 L. Kalinić 28 https://cdn.sofifa.org/players/4/19/220932.png Croatia https://cdn.sofifa.org/flags/10.png 76 78 KAA Gent ... 58.0 13.0 17.0 17.0 81.0 75.0 42.0 76.0 80.0 €10.2M
1340 1340 183895 M. Moralez 31 https://cdn.sofifa.org/players/4/19/183895.png Argentina https://cdn.sofifa.org/flags/52.png 76 76 New York City FC ... 76.0 48.0 58.0 51.0 6.0 5.0 14.0 14.0 9.0 €10.5M
1454 1454 172203 F. Forster 30 https://cdn.sofifa.org/players/4/19/172203.png England https://cdn.sofifa.org/flags/14.png 76 77 Southampton ... 56.0 11.0 13.0 12.0 72.0 76.0 64.0 77.0 78.0 €10.9M
2493 2493 202184 J. Plata 26 https://cdn.sofifa.org/players/4/19/202184.png Ecuador https://cdn.sofifa.org/flags/57.png 74 74 Real Salt Lake ... 70.0 34.0 29.0 27.0 11.0 15.0 9.0 13.0 13.0 €9M

5 rows × 89 columns

In [306]:
# Obtain number of outliers
print(f'Number of outliers: {len(outliers)}')
Number of outliers: 41
In [307]:
# Calculate the median
median_value = df['Height'].median()

# Impute outliers with the median
df_imputed = df.copy()
df_imputed.loc[outliers.index, 'Height'] = median_value
In [308]:
from scipy.stats.mstats import winsorize

df_winsorized = df.copy()
df_winsorized['Total'] = winsorize(df_winsorized['Height'],\
  limits = [0.05, 0.05], inplace = True)
In [309]:
# Create a box plot
g = sns.boxplot(data = df, x = 'Height')

# Add a title and change xlabel
g.set_title('Box Plot of Height')
g.set_xlabel('Height')
Out[309]:
Text(0.5, 0, 'Height')
In [310]:
# Scatter plot
fig, ax = plt.subplots(figsize = (6,4))
ax.scatter(df['Height'],df['Weight'])

# x-axis label
ax.set_xlabel('(Height)')

# y-axis label
ax.set_ylabel('(Weight )')
plt.show()
In [311]:
# Z score
from scipy import stats
import numpy as np

z = np.abs(stats.zscore(df['Height']))
print(z)
0        1.646010
1        0.995907
2        0.891177
3        1.750741
4        0.136343
           ...   
18202    0.891177
18203    1.373324
18204    1.268593
18205    0.513760
18206    0.513760
Name: Height, Length: 18159, dtype: float64
In [312]:
threshold = 2

# Position of the outlier
print(np.where(z > 2))
(array([   14,    19,    29,    63,    92,   102,   109,   115,   151,
         165,   204,   210,   225,   229,   235,   261,   274,   279,
         287,   293,   303,   310,   317,   372,   380,   389,   457,
         463,   466,   467,   544,   545,   582,   634,   639,   704,
         718,   731,   733,   740,   748,   762,   767,   802,   874,
         882,   979,  1010,  1084,  1127,  1132,  1165,  1175,  1193,
        1204,  1213,  1225,  1226,  1249,  1278,  1340,  1364,  1379,
        1411,  1412,  1416,  1425,  1426,  1428,  1429,  1431,  1453,
        1454,  1461,  1475,  1493,  1498,  1550,  1558,  1564,  1581,
        1599,  1623,  1631,  1641,  1680,  1684,  1719,  1744,  1780,
        1795,  1804,  1817,  1819,  1840,  1857,  1859,  1902,  1911,
        1984,  2063,  2075,  2077,  2091,  2094,  2104,  2129,  2177,
        2187,  2203,  2216,  2231,  2262,  2289,  2301,  2329,  2335,
        2344,  2355,  2356,  2357,  2433,  2436,  2493,  2521,  2537,
        2545,  2559,  2612,  2627,  2632,  2664,  2672,  2723,  2726,
        2732,  2736,  2750,  2766,  2768,  2770,  2781,  2792,  2815,
        2818,  2828,  2834,  2859,  2895,  2957,  2977,  2984,  3060,
        3121,  3137,  3146,  3221,  3259,  3326,  3346,  3374,  3379,
        3390,  3394,  3409,  3448,  3466,  3475,  3480,  3506,  3513,
        3533,  3559,  3579,  3588,  3614,  3651,  3657,  3712,  3724,
        3764,  3769,  3801,  3837,  3878,  3886,  3890,  3910,  3979,
        3981,  4017,  4062,  4161,  4174,  4177,  4189,  4204,  4236,
        4247,  4281,  4336,  4338,  4378,  4379,  4390,  4408,  4414,
        4418,  4424,  4428,  4453,  4454,  4460,  4473,  4515,  4526,
        4529,  4542,  4606,  4653,  4712,  4726,  4777,  4782,  4800,
        4860,  4868,  4874,  4945,  4963,  4974,  5013,  5049,  5053,
        5072,  5086,  5103,  5109,  5135,  5145,  5182,  5191,  5193,
        5205,  5211,  5216,  5244,  5285,  5307,  5345,  5350,  5402,
        5422,  5428,  5431,  5441,  5443,  5469,  5479,  5488,  5491,
        5521,  5539,  5544,  5553,  5566,  5569,  5575,  5578,  5579,
        5620,  5622,  5629,  5643,  5669,  5680,  5693,  5696,  5736,
        5785,  5790,  5822,  5828,  5833,  5843,  5851,  5852,  5874,
        5884,  5921,  5935,  5958,  5963,  5983,  5986,  5987,  6008,
        6028,  6038,  6083,  6101,  6110,  6115,  6121,  6124,  6143,
        6145,  6152,  6166,  6173,  6195,  6238,  6256,  6263,  6275,
        6284,  6296,  6304,  6340,  6350,  6353,  6362,  6375,  6376,
        6394,  6424,  6430,  6441,  6455,  6475,  6488,  6535,  6569,
        6574,  6584,  6627,  6628,  6631,  6667,  6673,  6694,  6695,
        6699,  6704,  6713,  6732,  6761,  6782,  6788,  6813,  6814,
        6823,  6880,  6893,  6895,  6915,  6982,  6985,  6994,  7009,
        7012,  7025,  7117,  7126,  7141,  7156,  7174,  7181,  7279,
        7348,  7353,  7356,  7409,  7410,  7413,  7431,  7484,  7509,
        7545,  7558,  7578,  7594,  7613,  7681,  7713,  7762,  7763,
        7785,  7788,  7806,  7809,  7829,  7865,  7909,  7956,  7962,
        7968,  7998,  8005,  8040,  8059,  8117,  8119,  8148,  8151,
        8166,  8168,  8191,  8196,  8200,  8217,  8223,  8269,  8281,
        8321,  8334,  8348,  8389,  8392,  8393,  8410,  8419,  8426,
        8451,  8480,  8495,  8496,  8504,  8533,  8534,  8536,  8646,
        8667,  8674,  8690,  8706,  8727,  8735,  8740,  8766,  8772,
        8781,  8813,  8820,  8838,  8845,  8859,  8867,  8874,  8908,
        8930,  8935,  8943,  8946,  8960,  8991,  9009,  9047,  9060,
        9126,  9159,  9233,  9258,  9287,  9350,  9356,  9363,  9375,
        9384,  9427,  9441,  9458,  9485,  9491,  9510,  9521,  9559,
        9570,  9583,  9588,  9639,  9648,  9666,  9710,  9740,  9757,
        9759,  9768,  9769,  9795,  9809,  9823,  9864,  9907,  9993,
        9997, 10000, 10046, 10075, 10080, 10113, 10157, 10180, 10214,
       10227, 10239, 10249, 10251, 10282, 10348, 10376, 10387, 10426,
       10440, 10465, 10537, 10558, 10637, 10653, 10666, 10699, 10705,
       10747, 10835, 10865, 10875, 10896, 10964, 10974, 11001, 11042,
       11071, 11077, 11096, 11112, 11126, 11127, 11154, 11158, 11199,
       11226, 11277, 11318, 11320, 11340, 11345, 11375, 11376, 11403,
       11418, 11432, 11436, 11487, 11495, 11503, 11517, 11537, 11598,
       11614, 11620, 11625, 11642, 11655, 11767, 11783, 11796, 11820,
       11866, 11903, 11923, 11934, 11935, 11973, 12049, 12068, 12075,
       12124, 12158, 12163, 12169, 12186, 12257, 12262, 12266, 12308,
       12318, 12437, 12458, 12460, 12468, 12470, 12480, 12494, 12531,
       12537, 12564, 12581, 12587, 12590, 12630, 12631, 12632, 12636,
       12666, 12668, 12751, 12781, 12817, 12832, 12853, 12854, 12941,
       12951, 12959, 12967, 12975, 12987, 13063, 13080, 13087, 13109,
       13112, 13178, 13180, 13233, 13237, 13290, 13303, 13308, 13322,
       13328, 13362, 13383, 13407, 13436, 13449, 13472, 13477, 13488,
       13489, 13492, 13493, 13502, 13549, 13556, 13559, 13695, 13738,
       13745, 13747, 13754, 13762, 13806, 13890, 13905, 13917, 13942,
       13950, 14036, 14038, 14053, 14059, 14062, 14063, 14069, 14071,
       14085, 14086, 14113, 14121, 14122, 14143, 14178, 14231, 14245,
       14253, 14256, 14269, 14270, 14296, 14299, 14326, 14335, 14383,
       14388, 14389, 14411, 14414, 14421, 14432, 14463, 14494, 14496,
       14500, 14506, 14518, 14538, 14567, 14568, 14584, 14591, 14602,
       14646, 14647, 14663, 14680, 14727, 14734, 14789, 14791, 14822,
       14846, 14868, 14922, 14932, 14944, 15026, 15050, 15068, 15104,
       15115, 15133, 15175, 15177, 15181, 15192, 15195, 15221, 15228,
       15231, 15246, 15250, 15271, 15278, 15295, 15298, 15301, 15305,
       15306, 15326, 15331, 15335, 15337, 15339, 15365, 15384, 15396,
       15413, 15483, 15485, 15496, 15506, 15507, 15537, 15554, 15572,
       15576, 15588, 15603, 15616, 15637, 15639, 15671, 15716, 15726,
       15749, 15780, 15827, 15830, 15843, 15850, 15855, 15892, 15930,
       15931, 15942, 15989, 16018, 16042, 16044, 16059, 16081, 16083,
       16129, 16161, 16218, 16219, 16236, 16255, 16261, 16291, 16372,
       16401, 16407, 16413, 16428, 16435, 16455, 16483, 16493, 16498,
       16521, 16526, 16531, 16536, 16537, 16546, 16560, 16563, 16582,
       16646, 16647, 16704, 16721, 16752, 16758, 16763, 16790, 16796,
       16801, 16820, 16829, 16854, 16856, 16875, 16877, 16899, 16917,
       16930, 16954, 16990, 17008, 17042, 17051, 17052, 17068, 17069,
       17071, 17084, 17103, 17109, 17127, 17156, 17174, 17180, 17187,
       17221, 17227, 17239, 17296, 17299, 17331, 17334, 17335, 17340,
       17369, 17395, 17398, 17419, 17423, 17449, 17453, 17462, 17484,
       17521, 17542, 17546, 17555, 17571, 17595, 17609, 17646, 17698,
       17725, 17738, 17770, 17801, 17813, 17872, 17873, 17879, 17880,
       17906, 18002, 18022, 18046, 18110, 18111, 18151], dtype=int64),)
In [314]:
# IQR
Q1 = np.percentile(df['Height'], 25, method='midpoint')
Q3 = np.percentile(df['Height'], 75, method='midpoint')
IQR = Q3 - Q1
print(IQR)
10.159999999999997
In [315]:
# Above Upper bound
upper=Q3+1.5*IQR
upper_array=np.array(df['Height']>=upper)
print("Upper Bound:",upper)
print(upper_array.sum())

#Below Lower bound
lower=Q1-1.5*IQR
lower_array=np.array(df['Height']<=lower)
print("Lower Bound:",lower)
print(lower_array.sum())
Upper Bound: 200.65999999999997
33
Lower Bound: 160.01999999999998
8
In [316]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
In [317]:
from sklearn.ensemble import IsolationForest
In [319]:
df['Height'].hist(figsize=(15,10));
In [320]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
In [323]:
sns.heatmap(df.corr(), annot=True)
Out[323]:
<Axes: >
In [325]:
ax = sns.lmplot(x = "Height",
               y = "Weight",
               data = df, hue = "Age", fit_reg = False, height = 5, aspect = 2.2)
sns.regplot(x = "Height",
               y = "Weight",
               data = df, scatter=False, ax=ax.axes[0, 0], order = 3)
plt.ylabel("Weight")
plt.xticks(list(range(1,30)), list(df['Height'].unique()))
plt.title("Relationship Between Height and Weight", fontsize=18)
plt.xlabel("Height", fontsize=14)
plt.ylabel("Weight", fontsize=14)
plt.show()
---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
Cell In[325], line 4
      1 ax = sns.lmplot(x = "Height",
      2                y = "Weight",
      3                data = df, hue = "Age", fit_reg = False, height = 5, aspect = 2.2)
----> 4 sns.regplot(x = "Height",
      5                y = "Weight",
      6                data = df, scatter=False, ax=ax.axes[0, 0], order = 3)
      7 plt.ylabel("Weight")
      8 plt.xticks(list(range(1,30)), list(df['Height'].unique()))

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:759, in regplot(data, x, y, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    757 scatter_kws["marker"] = marker
    758 line_kws = {} if line_kws is None else copy.copy(line_kws)
--> 759 plotter.plot(ax, scatter_kws, line_kws)
    760 return ax

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:368, in _RegressionPlotter.plot(self, ax, scatter_kws, line_kws)
    365     self.scatterplot(ax, scatter_kws)
    367 if self.fit_reg:
--> 368     self.lineplot(ax, line_kws)
    370 # Label the axes
    371 if hasattr(self.x, "name"):

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:413, in _RegressionPlotter.lineplot(self, ax, kws)
    411 """Draw the model."""
    412 # Fit the regression model
--> 413 grid, yhat, err_bands = self.fit_regression(ax)
    414 edges = grid[0], grid[-1]
    416 # Get set default aesthetics

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:204, in _RegressionPlotter.fit_regression(self, ax, x_range, grid)
    202 # Fit the regression
    203 if self.order > 1:
--> 204     yhat, yhat_boots = self.fit_poly(grid, self.order)
    205 elif self.logistic:
    206     from statsmodels.genmod.generalized_linear_model import GLM

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:254, in _RegressionPlotter.fit_poly(self, grid, order)
    251     return np.polyval(np.polyfit(_x, _y, order), grid)
    253 x, y = self.x, self.y
--> 254 yhat = reg_func(x, y)
    255 if self.ci is None:
    256     return yhat, None

File ~\anaconda3\Lib\site-packages\seaborn\regression.py:251, in _RegressionPlotter.fit_poly.<locals>.reg_func(_x, _y)
    250 def reg_func(_x, _y):
--> 251     return np.polyval(np.polyfit(_x, _y, order), grid)

File <__array_function__ internals>:200, in polyfit(*args, **kwargs)

File ~\anaconda3\Lib\site-packages\numpy\lib\polynomial.py:668, in polyfit(x, y, deg, rcond, full, w, cov)
    666 scale = NX.sqrt((lhs*lhs).sum(axis=0))
    667 lhs /= scale
--> 668 c, resids, rank, s = lstsq(lhs, rhs, rcond)
    669 c = (c.T/scale).T  # broadcast scale coefficients
    671 # warn on rank reduction, which indicates an ill conditioned matrix

File <__array_function__ internals>:200, in lstsq(*args, **kwargs)

File ~\anaconda3\Lib\site-packages\numpy\linalg\linalg.py:2285, in lstsq(a, b, rcond)
   2282 if n_rhs == 0:
   2283     # lapack can't handle n_rhs = 0 - so allocate the array one larger in that axis
   2284     b = zeros(b.shape[:-2] + (m, n_rhs + 1), dtype=b.dtype)
-> 2285 x, resids, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)
   2286 if m == 0:
   2287     x[...] = 0

UFuncTypeError: Cannot cast ufunc 'lstsq_n' input 1 from dtype('O') to dtype('float64') with casting rule 'same_kind'
In [326]:
# Find the pearson correlations matrix
corr = df.corr(method = 'pearson')
corr
Out[326]:
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number ... Penalties Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes
Unnamed: 0 1.000000 0.416108 -0.455707 -0.972796 -0.633444 -0.596913 -0.413322 -0.204026 -0.415672 0.213645 ... -0.337899 -0.716173 -0.280829 -0.248564 -0.220034 0.027757 0.027026 0.030654 0.019842 0.025610
ID 0.416108 1.000000 -0.739162 -0.417354 0.046577 -0.231366 -0.356191 -0.075784 -0.056914 0.182074 ... -0.140657 -0.384473 -0.110198 -0.085929 -0.068409 -0.105594 -0.111149 -0.106652 -0.118250 -0.105778
Age -0.455707 -0.739162 1.000000 0.453069 -0.252281 0.236875 0.253765 0.059867 0.027649 -0.241156 ... 0.139535 0.391023 0.142817 0.119745 0.103089 0.101277 0.106419 0.104964 0.116402 0.103313
Overall -0.972796 -0.417354 0.453069 1.000000 0.661180 0.607236 0.499491 0.212015 0.414463 -0.218931 ... 0.341429 0.727655 0.286505 0.252629 0.222811 -0.025937 -0.025062 -0.029372 -0.017674 -0.023276
Potential -0.633444 0.046577 -0.252281 0.661180 1.000000 0.384598 0.372993 0.162346 0.354290 -0.010474 ... 0.224281 0.440008 0.162801 0.143564 0.128980 -0.053446 -0.054672 -0.059061 -0.052589 -0.053341
Special -0.596913 -0.231366 0.236875 0.607236 0.384598 1.000000 0.292208 0.341855 0.763412 -0.133716 ... 0.734533 0.752331 0.561866 0.538802 0.506968 -0.674637 -0.673625 -0.670254 -0.668272 -0.673238
International Reputation -0.413322 -0.356191 0.253765 0.499491 0.372993 0.292208 1.000000 0.128317 0.208153 -0.077298 ... 0.218620 0.392787 0.115208 0.092846 0.079176 0.004526 0.003942 0.000651 0.006904 0.003444
Weak Foot -0.204026 -0.075784 0.059867 0.212015 0.162346 0.341855 0.128317 1.000000 0.340721 -0.035410 ... 0.330252 0.278132 0.065673 0.042646 0.026105 -0.231905 -0.233098 -0.229395 -0.231298 -0.232574
Skill Moves -0.415672 -0.056914 0.027649 0.414463 0.354290 0.763412 0.208153 0.340721 1.000000 -0.035194 ... 0.690434 0.586836 0.241428 0.210517 0.178607 -0.621675 -0.619755 -0.616990 -0.618853 -0.621925
Jersey Number 0.213645 0.182074 -0.241156 -0.218931 -0.010474 -0.133716 -0.077298 -0.035410 -0.035194 1.000000 ... -0.028023 -0.167523 -0.142474 -0.133285 -0.124610 0.004807 0.001543 0.001162 -0.002736 0.003255
Height -0.034734 -0.090199 0.082604 0.038546 -0.009807 -0.382862 0.034881 -0.174793 -0.422753 -0.039469 ... -0.340664 -0.135785 -0.073733 -0.058877 -0.066869 0.360594 0.360796 0.358780 0.361916 0.362636
Weight -0.150844 -0.191425 0.230213 0.154634 -0.006947 -0.267830 0.088340 -0.130724 -0.351209 -0.087319 ... -0.253387 -0.034444 -0.049356 -0.046835 -0.056164 0.340034 0.339024 0.337717 0.342178 0.341135
Crossing -0.390062 -0.131994 0.130545 0.394972 0.246319 0.866417 0.191770 0.307925 0.741035 -0.076585 ... 0.645805 0.575446 0.443101 0.428963 0.409961 -0.663053 -0.660193 -0.659767 -0.660160 -0.662539
Finishing -0.325529 -0.082323 0.068660 0.332515 0.243355 0.724244 0.178373 0.357416 0.743439 -0.006639 ... 0.837827 0.533414 0.024218 -0.033023 -0.071811 -0.588752 -0.587145 -0.583268 -0.584852 -0.586913
HeadingAccuracy -0.337766 -0.106815 0.147183 0.340776 0.200988 0.644421 0.157483 0.183238 0.443005 -0.091688 ... 0.551978 0.507208 0.583123 0.561063 0.533643 -0.750417 -0.749888 -0.746444 -0.744443 -0.748895
ShortPassing -0.492495 -0.136279 0.132894 0.502550 0.369189 0.906729 0.242803 0.322133 0.730363 -0.100241 ... 0.676063 0.685137 0.559576 0.541131 0.508644 -0.729785 -0.728024 -0.724381 -0.723782 -0.728721
Volleys -0.384285 -0.159915 0.142472 0.391338 0.254906 0.773974 0.243089 0.357340 0.745077 -0.026731 ... 0.829257 0.595281 0.120919 0.072788 0.035457 -0.590808 -0.588668 -0.584954 -0.586131 -0.588670
Dribbling -0.364106 -0.030340 0.010166 0.372426 0.315019 0.874274 0.179041 0.352658 0.839757 -0.028021 ... 0.769594 0.597498 0.336072 0.301251 0.273963 -0.754625 -0.753181 -0.749816 -0.751348 -0.754341
Curve -0.416378 -0.169511 0.143276 0.419491 0.279944 0.851900 0.233681 0.345468 0.771052 -0.055428 ... 0.751833 0.616532 0.289529 0.261481 0.232869 -0.606286 -0.603141 -0.600266 -0.603540 -0.604960
FKAccuracy -0.395433 -0.199549 0.193467 0.396892 0.230544 0.806414 0.223564 0.330472 0.701068 -0.068843 ... 0.734440 0.585120 0.297976 0.279153 0.247903 -0.556605 -0.553644 -0.549911 -0.552641 -0.554920
LongPassing -0.477500 -0.186764 0.181310 0.483909 0.321437 0.846302 0.239525 0.277174 0.622342 -0.117424 ... 0.542247 0.645797 0.587106 0.587430 0.562230 -0.596820 -0.594999 -0.591453 -0.591561 -0.595887
BallControl -0.449655 -0.100184 0.084969 0.460197 0.354396 0.912107 0.217946 0.356383 0.818051 -0.073210 ... 0.769791 0.674881 0.452705 0.417566 0.384802 -0.788444 -0.786797 -0.783423 -0.783607 -0.787939
Acceleration -0.185030 0.133236 -0.158667 0.196869 0.234608 0.654337 0.044319 0.261435 0.652356 -0.004395 ... 0.532908 0.347427 0.195369 0.163000 0.157565 -0.593008 -0.594866 -0.592127 -0.592143 -0.593201
SprintSpeed -0.198797 0.132437 -0.151682 0.210647 0.236771 0.645963 0.044070 0.248822 0.624098 -0.015069 ... 0.521071 0.351607 0.212575 0.178214 0.171980 -0.597677 -0.599694 -0.597320 -0.596498 -0.597837
Agility -0.256270 -0.019897 -0.019395 0.264952 0.222310 0.699673 0.100869 0.302062 0.681765 -0.034158 ... 0.566175 0.432511 0.167122 0.129204 0.116686 -0.527756 -0.528482 -0.527164 -0.526983 -0.528899
Reactions -0.832156 -0.408617 0.453124 0.850045 0.513425 0.597169 0.445614 0.201341 0.377044 -0.192622 ... 0.346143 0.685558 0.283607 0.255399 0.228355 -0.062967 -0.061940 -0.065927 -0.055031 -0.059961
Balance -0.097160 0.048463 -0.089877 0.103160 0.138025 0.586788 0.050076 0.254022 0.578459 0.008009 ... 0.482794 0.310763 0.178695 0.154045 0.152470 -0.504727 -0.506102 -0.503970 -0.503652 -0.505974
ShotPower -0.440023 -0.166133 0.156947 0.441118 0.288318 0.835277 0.227772 0.332855 0.718237 -0.053860 ... 0.795220 0.634495 0.296944 0.256403 0.220237 -0.654117 -0.654099 -0.649403 -0.651409 -0.653475
Jumping -0.261581 -0.169369 0.177167 0.264435 0.109151 0.321846 0.120931 0.069752 0.107553 -0.104179 ... 0.133294 0.252353 0.279196 0.260645 0.260261 -0.192700 -0.193692 -0.195282 -0.189079 -0.192050
Stamina -0.358451 -0.053895 0.097793 0.365656 0.202563 0.792762 0.094780 0.232094 0.570226 -0.127822 ... 0.516426 0.523112 0.587782 0.570055 0.544702 -0.701467 -0.698556 -0.696729 -0.696073 -0.699670
Strength -0.342839 -0.259756 0.332798 0.349326 0.075769 0.192990 0.131280 -0.008470 -0.041475 -0.158411 ... 0.054491 0.280522 0.333334 0.332159 0.304849 -0.111012 -0.109660 -0.110253 -0.103878 -0.107497
LongShots -0.417853 -0.161549 0.155096 0.420795 0.266740 0.840049 0.213960 0.355967 0.752980 -0.046174 ... 0.812446 0.616102 0.215510 0.172331 0.133603 -0.612381 -0.610739 -0.605952 -0.607200 -0.610087
Aggression -0.397067 -0.228329 0.265190 0.395470 0.171174 0.666236 0.173327 0.131524 0.347795 -0.146907 ... 0.336089 0.515776 0.723961 0.744216 0.721384 -0.575843 -0.576114 -0.573607 -0.571201 -0.575142
Interceptions -0.319162 -0.160602 0.197845 0.321326 0.154908 0.561676 0.129586 0.053097 0.209604 -0.158526 ... 0.110834 0.397450 0.888349 0.941471 0.928282 -0.485585 -0.486324 -0.485394 -0.481279 -0.486036
Positioning -0.351820 -0.088330 0.082443 0.356493 0.245616 0.824307 0.183003 0.346896 0.781248 -0.025422 ... 0.801268 0.580498 0.202597 0.158060 0.124228 -0.679480 -0.677699 -0.674393 -0.675569 -0.678582
Vision -0.490296 -0.215170 0.187422 0.498894 0.348141 0.761992 0.284600 0.337897 0.674057 -0.078050 ... 0.632927 0.636280 0.176760 0.146460 0.113228 -0.381899 -0.377807 -0.374737 -0.375775 -0.381158
Penalties -0.337899 -0.140657 0.139535 0.341429 0.224281 0.734533 0.218620 0.330252 0.690434 -0.028023 ... 1.000000 0.551801 0.152296 0.101920 0.066693 -0.620069 -0.618968 -0.614006 -0.617074 -0.619099
Composure -0.716173 -0.384473 0.391023 0.727655 0.440008 0.752331 0.392787 0.278132 0.586836 -0.167523 ... 0.551801 1.000000 0.384081 0.351726 0.317492 -0.378750 -0.375720 -0.374897 -0.370234 -0.377626
Marking -0.280829 -0.110198 0.142817 0.286505 0.162801 0.561866 0.115208 0.065673 0.241428 -0.142474 ... 0.152296 0.384081 1.000000 0.906541 0.895908 -0.550978 -0.552263 -0.549498 -0.546670 -0.551290
StandingTackle -0.248564 -0.085929 0.119745 0.252629 0.143564 0.538802 0.092846 0.042646 0.210517 -0.133285 ... 0.101920 0.351726 0.906541 1.000000 0.974659 -0.530989 -0.532160 -0.531092 -0.527792 -0.531474
SlidingTackle -0.220034 -0.068409 0.103089 0.222811 0.128980 0.506968 0.079176 0.026105 0.178607 -0.124610 ... 0.066693 0.317492 0.895908 0.974659 1.000000 -0.509337 -0.510591 -0.509378 -0.505792 -0.509425
GKDiving 0.027757 -0.105594 0.101277 -0.025937 -0.053446 -0.674637 0.004526 -0.231905 -0.621675 0.004807 ... -0.620069 -0.378750 -0.550978 -0.530989 -0.509337 1.000000 0.970280 0.965685 0.969864 0.973320
GKHandling 0.027026 -0.111149 0.106419 -0.025062 -0.054672 -0.673625 0.003942 -0.233098 -0.619755 0.001543 ... -0.618968 -0.375720 -0.552263 -0.532160 -0.510591 0.970280 1.000000 0.965239 0.969408 0.970264
GKKicking 0.030654 -0.106652 0.104964 -0.029372 -0.059061 -0.670254 0.000651 -0.229395 -0.616990 0.001162 ... -0.614006 -0.374897 -0.549498 -0.531092 -0.509378 0.965685 0.965239 1.000000 0.964336 0.966337
GKPositioning 0.019842 -0.118250 0.116402 -0.017674 -0.052589 -0.668272 0.006904 -0.231298 -0.618853 -0.002736 ... -0.617074 -0.370234 -0.546670 -0.527792 -0.505792 0.969864 0.969408 0.964336 1.000000 0.970130
GKReflexes 0.025610 -0.105778 0.103313 -0.023276 -0.053341 -0.673238 0.003444 -0.232574 -0.621925 0.003255 ... -0.619099 -0.377626 -0.551290 -0.531474 -0.509425 0.973320 0.970264 0.966337 0.970130 1.000000

46 rows × 46 columns

In [327]:
# correaltions between Potential and Overall columns
c = np.corrcoef(df['Potential'],df['Overall'])
print('Correlations between Potential and Overall\n',c)
Correlations between Potential and Overall
 [[1.         0.66118012]
 [0.66118012 1.        ]]
In [328]:
plt.figure(figsize=(10,8), dpi =500)
sns.heatmap(corr,annot=True,fmt=".2f", linewidth=.5)
plt.show()
In [329]:
# correaltions between Potential and Overall columns
c = np.corrcoef(df['Age'],df['Overall'])
print('Correlations between Age and Overall\n',c)
Correlations between Age and Overall
 [[1.         0.45306932]
 [0.45306932 1.        ]]
In [330]:
#bestfit line
sns.lmplot(x="Age", y="Overall", data=df);
In [331]:
#scatterplot
import seaborn as sns
sns.scatterplot(x="Age", y="Overall", data=df);
In [333]:
sns.lmplot(x="Age", y="Overall", hue="Potential", data=df);
In [336]:
#Corrleation matrix
cormat = df.corr()
round(cormat,2)
Out[336]:
Unnamed: 0 ID Age Overall Potential Special International Reputation Weak Foot Skill Moves Jersey Number ... Penalties Composure Marking StandingTackle SlidingTackle GKDiving GKHandling GKKicking GKPositioning GKReflexes
Unnamed: 0 1.00 0.42 -0.46 -0.97 -0.63 -0.60 -0.41 -0.20 -0.42 0.21 ... -0.34 -0.72 -0.28 -0.25 -0.22 0.03 0.03 0.03 0.02 0.03
ID 0.42 1.00 -0.74 -0.42 0.05 -0.23 -0.36 -0.08 -0.06 0.18 ... -0.14 -0.38 -0.11 -0.09 -0.07 -0.11 -0.11 -0.11 -0.12 -0.11
Age -0.46 -0.74 1.00 0.45 -0.25 0.24 0.25 0.06 0.03 -0.24 ... 0.14 0.39 0.14 0.12 0.10 0.10 0.11 0.10 0.12 0.10
Overall -0.97 -0.42 0.45 1.00 0.66 0.61 0.50 0.21 0.41 -0.22 ... 0.34 0.73 0.29 0.25 0.22 -0.03 -0.03 -0.03 -0.02 -0.02
Potential -0.63 0.05 -0.25 0.66 1.00 0.38 0.37 0.16 0.35 -0.01 ... 0.22 0.44 0.16 0.14 0.13 -0.05 -0.05 -0.06 -0.05 -0.05
Special -0.60 -0.23 0.24 0.61 0.38 1.00 0.29 0.34 0.76 -0.13 ... 0.73 0.75 0.56 0.54 0.51 -0.67 -0.67 -0.67 -0.67 -0.67
International Reputation -0.41 -0.36 0.25 0.50 0.37 0.29 1.00 0.13 0.21 -0.08 ... 0.22 0.39 0.12 0.09 0.08 0.00 0.00 0.00 0.01 0.00
Weak Foot -0.20 -0.08 0.06 0.21 0.16 0.34 0.13 1.00 0.34 -0.04 ... 0.33 0.28 0.07 0.04 0.03 -0.23 -0.23 -0.23 -0.23 -0.23
Skill Moves -0.42 -0.06 0.03 0.41 0.35 0.76 0.21 0.34 1.00 -0.04 ... 0.69 0.59 0.24 0.21 0.18 -0.62 -0.62 -0.62 -0.62 -0.62
Jersey Number 0.21 0.18 -0.24 -0.22 -0.01 -0.13 -0.08 -0.04 -0.04 1.00 ... -0.03 -0.17 -0.14 -0.13 -0.12 0.00 0.00 0.00 -0.00 0.00
Height -0.03 -0.09 0.08 0.04 -0.01 -0.38 0.03 -0.17 -0.42 -0.04 ... -0.34 -0.14 -0.07 -0.06 -0.07 0.36 0.36 0.36 0.36 0.36
Weight -0.15 -0.19 0.23 0.15 -0.01 -0.27 0.09 -0.13 -0.35 -0.09 ... -0.25 -0.03 -0.05 -0.05 -0.06 0.34 0.34 0.34 0.34 0.34
Crossing -0.39 -0.13 0.13 0.39 0.25 0.87 0.19 0.31 0.74 -0.08 ... 0.65 0.58 0.44 0.43 0.41 -0.66 -0.66 -0.66 -0.66 -0.66
Finishing -0.33 -0.08 0.07 0.33 0.24 0.72 0.18 0.36 0.74 -0.01 ... 0.84 0.53 0.02 -0.03 -0.07 -0.59 -0.59 -0.58 -0.58 -0.59
HeadingAccuracy -0.34 -0.11 0.15 0.34 0.20 0.64 0.16 0.18 0.44 -0.09 ... 0.55 0.51 0.58 0.56 0.53 -0.75 -0.75 -0.75 -0.74 -0.75
ShortPassing -0.49 -0.14 0.13 0.50 0.37 0.91 0.24 0.32 0.73 -0.10 ... 0.68 0.69 0.56 0.54 0.51 -0.73 -0.73 -0.72 -0.72 -0.73
Volleys -0.38 -0.16 0.14 0.39 0.25 0.77 0.24 0.36 0.75 -0.03 ... 0.83 0.60 0.12 0.07 0.04 -0.59 -0.59 -0.58 -0.59 -0.59
Dribbling -0.36 -0.03 0.01 0.37 0.32 0.87 0.18 0.35 0.84 -0.03 ... 0.77 0.60 0.34 0.30 0.27 -0.75 -0.75 -0.75 -0.75 -0.75
Curve -0.42 -0.17 0.14 0.42 0.28 0.85 0.23 0.35 0.77 -0.06 ... 0.75 0.62 0.29 0.26 0.23 -0.61 -0.60 -0.60 -0.60 -0.60
FKAccuracy -0.40 -0.20 0.19 0.40 0.23 0.81 0.22 0.33 0.70 -0.07 ... 0.73 0.59 0.30 0.28 0.25 -0.56 -0.55 -0.55 -0.55 -0.55
LongPassing -0.48 -0.19 0.18 0.48 0.32 0.85 0.24 0.28 0.62 -0.12 ... 0.54 0.65 0.59 0.59 0.56 -0.60 -0.59 -0.59 -0.59 -0.60
BallControl -0.45 -0.10 0.08 0.46 0.35 0.91 0.22 0.36 0.82 -0.07 ... 0.77 0.67 0.45 0.42 0.38 -0.79 -0.79 -0.78 -0.78 -0.79
Acceleration -0.19 0.13 -0.16 0.20 0.23 0.65 0.04 0.26 0.65 -0.00 ... 0.53 0.35 0.20 0.16 0.16 -0.59 -0.59 -0.59 -0.59 -0.59
SprintSpeed -0.20 0.13 -0.15 0.21 0.24 0.65 0.04 0.25 0.62 -0.02 ... 0.52 0.35 0.21 0.18 0.17 -0.60 -0.60 -0.60 -0.60 -0.60
Agility -0.26 -0.02 -0.02 0.26 0.22 0.70 0.10 0.30 0.68 -0.03 ... 0.57 0.43 0.17 0.13 0.12 -0.53 -0.53 -0.53 -0.53 -0.53
Reactions -0.83 -0.41 0.45 0.85 0.51 0.60 0.45 0.20 0.38 -0.19 ... 0.35 0.69 0.28 0.26 0.23 -0.06 -0.06 -0.07 -0.06 -0.06
Balance -0.10 0.05 -0.09 0.10 0.14 0.59 0.05 0.25 0.58 0.01 ... 0.48 0.31 0.18 0.15 0.15 -0.50 -0.51 -0.50 -0.50 -0.51
ShotPower -0.44 -0.17 0.16 0.44 0.29 0.84 0.23 0.33 0.72 -0.05 ... 0.80 0.63 0.30 0.26 0.22 -0.65 -0.65 -0.65 -0.65 -0.65
Jumping -0.26 -0.17 0.18 0.26 0.11 0.32 0.12 0.07 0.11 -0.10 ... 0.13 0.25 0.28 0.26 0.26 -0.19 -0.19 -0.20 -0.19 -0.19
Stamina -0.36 -0.05 0.10 0.37 0.20 0.79 0.09 0.23 0.57 -0.13 ... 0.52 0.52 0.59 0.57 0.54 -0.70 -0.70 -0.70 -0.70 -0.70
Strength -0.34 -0.26 0.33 0.35 0.08 0.19 0.13 -0.01 -0.04 -0.16 ... 0.05 0.28 0.33 0.33 0.30 -0.11 -0.11 -0.11 -0.10 -0.11
LongShots -0.42 -0.16 0.16 0.42 0.27 0.84 0.21 0.36 0.75 -0.05 ... 0.81 0.62 0.22 0.17 0.13 -0.61 -0.61 -0.61 -0.61 -0.61
Aggression -0.40 -0.23 0.27 0.40 0.17 0.67 0.17 0.13 0.35 -0.15 ... 0.34 0.52 0.72 0.74 0.72 -0.58 -0.58 -0.57 -0.57 -0.58
Interceptions -0.32 -0.16 0.20 0.32 0.15 0.56 0.13 0.05 0.21 -0.16 ... 0.11 0.40 0.89 0.94 0.93 -0.49 -0.49 -0.49 -0.48 -0.49
Positioning -0.35 -0.09 0.08 0.36 0.25 0.82 0.18 0.35 0.78 -0.03 ... 0.80 0.58 0.20 0.16 0.12 -0.68 -0.68 -0.67 -0.68 -0.68
Vision -0.49 -0.22 0.19 0.50 0.35 0.76 0.28 0.34 0.67 -0.08 ... 0.63 0.64 0.18 0.15 0.11 -0.38 -0.38 -0.37 -0.38 -0.38
Penalties -0.34 -0.14 0.14 0.34 0.22 0.73 0.22 0.33 0.69 -0.03 ... 1.00 0.55 0.15 0.10 0.07 -0.62 -0.62 -0.61 -0.62 -0.62
Composure -0.72 -0.38 0.39 0.73 0.44 0.75 0.39 0.28 0.59 -0.17 ... 0.55 1.00 0.38 0.35 0.32 -0.38 -0.38 -0.37 -0.37 -0.38
Marking -0.28 -0.11 0.14 0.29 0.16 0.56 0.12 0.07 0.24 -0.14 ... 0.15 0.38 1.00 0.91 0.90 -0.55 -0.55 -0.55 -0.55 -0.55
StandingTackle -0.25 -0.09 0.12 0.25 0.14 0.54 0.09 0.04 0.21 -0.13 ... 0.10 0.35 0.91 1.00 0.97 -0.53 -0.53 -0.53 -0.53 -0.53
SlidingTackle -0.22 -0.07 0.10 0.22 0.13 0.51 0.08 0.03 0.18 -0.12 ... 0.07 0.32 0.90 0.97 1.00 -0.51 -0.51 -0.51 -0.51 -0.51
GKDiving 0.03 -0.11 0.10 -0.03 -0.05 -0.67 0.00 -0.23 -0.62 0.00 ... -0.62 -0.38 -0.55 -0.53 -0.51 1.00 0.97 0.97 0.97 0.97
GKHandling 0.03 -0.11 0.11 -0.03 -0.05 -0.67 0.00 -0.23 -0.62 0.00 ... -0.62 -0.38 -0.55 -0.53 -0.51 0.97 1.00 0.97 0.97 0.97
GKKicking 0.03 -0.11 0.10 -0.03 -0.06 -0.67 0.00 -0.23 -0.62 0.00 ... -0.61 -0.37 -0.55 -0.53 -0.51 0.97 0.97 1.00 0.96 0.97
GKPositioning 0.02 -0.12 0.12 -0.02 -0.05 -0.67 0.01 -0.23 -0.62 -0.00 ... -0.62 -0.37 -0.55 -0.53 -0.51 0.97 0.97 0.96 1.00 0.97
GKReflexes 0.03 -0.11 0.10 -0.02 -0.05 -0.67 0.00 -0.23 -0.62 0.00 ... -0.62 -0.38 -0.55 -0.53 -0.51 0.97 0.97 0.97 0.97 1.00

46 rows × 46 columns

In [337]:
#Correlation matrix to heat map
sns.heatmap(cormat);
In [341]:
import pingouin as pg
pg.corr(x=df['Potential'], y=df['Overall'])
---------------------------------------------------------------------------
FloatingPointError                        Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_4616\4157861621.py in ?()
      1 import pingouin as pg
----> 2 pg.corr(x=df['Potential'], y=df['Overall'])

~\anaconda3\Lib\site-packages\pingouin\correlation.py in ?(x, y, alternative, method, **kwargs)
    600     n = x.size
    601 
    602     # Compute correlation coefficient and two-sided p-value
    603     if method == "pearson":
--> 604         r, pval = pearsonr(x, y)
    605     elif method == "spearman":
    606         r, pval = spearmanr(x, y, **kwargs)
    607     elif method == "kendall":

~\anaconda3\Lib\site-packages\scipy\stats\_stats_py.py in ?(x, y, alternative, method)
   4856     # hypothesis is the beta distribution on (-1, 1) with a = b = n/2 - 1.
   4857     ab = n/2 - 1
   4858     dist = stats.beta(ab, ab, loc=-1, scale=2)
   4859     if alternative == 'two-sided':
-> 4860         prob = 2*dist.sf(abs(r))
   4861     elif alternative == 'less':
   4862         prob = dist.cdf(r)
   4863     elif alternative == 'greater':

~\anaconda3\Lib\site-packages\scipy\stats\_distn_infrastructure.py in ?(self, x)
    493     def sf(self, x):
--> 494         return self.dist.sf(x, *self.args, **self.kwds)

~\anaconda3\Lib\site-packages\scipy\stats\_distn_infrastructure.py in ?(self, x, *args, **kwds)
   2155         place(output, (1-cond0)+np.isnan(x), self.badvalue)
   2156         place(output, cond2, 1.0)
   2157         if np.any(cond):
   2158             goodargs = argsreduce(cond, *((x,)+args))
-> 2159             place(output, cond, self._sf(*goodargs))
   2160         if output.ndim == 0:
   2161             return output[()]
   2162         return output

~\anaconda3\Lib\site-packages\scipy\stats\_continuous_distns.py in ?(self, x, a, b)
    692     def _sf(self, x, a, b):
--> 693         return _boost._beta_sf(x, a, b)

FloatingPointError: underflow encountered in _beta_sf
In [ ]: